In [1]:
import rpy2
import scipy
import logging
import warnings
import anndata
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import decoupler as dc
import scrublet as scr
import decoupler as dc
from scipy import sparse
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from typing import Optional, Union
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

In [2]:
def get_sys_dpi(width, height, diag):
    '''
    obtain dpi of system
    
    w: width in pixels (if unsure, go vist `whatismyscreenresolution.net`)
    h: height in pixels
    d: diagonal in inches
    '''
    w_inches = (diag**2/ (1 + height**2/width**2))**0.5
    return round(width/w_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
anndata2ri                  1.1
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
cloudpickle                 2.2.0
colorama                    0.4.4
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
decoupler                   1.4.0
defusedxml                  0.7.1
de

# **Consensus network analysis**

Here, we perform consensus co-expression network analysis using `hdWGCNA`. 

Consensus co-expression network analysis differs from the standard co-expression network analysis workflow by `constructing individual networks across distinct datasets`, and `then computing an integrated co-expression network`. 

This framework can be used to `identify networks that are conserved across a variety of biological conditions`, and it can also be used as a way to construct a unified network from any number of different datasets. 

Here, use this framework to identify consensus coexpression network of genes across celltypes (from the three studies) in AD.

- [**Morabito et al. bioRxiv 2022**](https://www.biorxiv.org/content/10.1101/2022.09.22.509094v1)
- [**Morabito & Miyoshi et al. Nature Genetics 2021**](https://www.nature.com/articles/s41588-021-00894-z)


In [48]:
%%R
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)


    # single-cell analysis package
    library(Seurat)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages:
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    library(GSEABase)
    library(GSVA) 
# needs to be run every time you start R and want to use %>%
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)

## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
map_meta = True
deg_method =  'DESeq2-Wald'
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
filter_genes = "TRUE"
studies = ['allen_mtg', 'leng_sfg', 'leng_etc']
subject_id = {'allen_mtg': 'individualID',
             'leng_sfg': 'PatientID',
             'leng_etc': 'PatientID'}      # for leng this is `PatientID` for mathys is 'Subject', and 'individualID' for allen

gene_celltype_threshold = 0.05      # determines number of cells the gene must be expressed in 
covariates = ['None']               # list of covariates to be accounted for in regression.
gene_selection = 'custom'           # specifies the gene selection method when setting up seurat object for WGCNA. The th
celltypes = ["Inhibitory"]          #["Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC"]
gene_selection = 'custom'       # specifies the gene selection method when setting up seurat object for WGCNA. The th
                                # Posible values are "custom", "fraction", "variable"
                                # If custom, a list of genes must be passed.

gene_set_select = 'overlap'      # If gene_selection = 'custom'. This specifies how to obtain the list of
                                  # genes to pass into `SetupForWGCNA`. # The posible values are "diff_exp", "overlap", "all"

In [6]:
adatas = []
for save_prefix in studies:
    
    print('----------------------------')
    print(f'Fetching {save_prefix.upper()} data')
    print('----------------------------')

    adata = sc.read_h5ad(f'../data/processed/{save_prefix}/{save_prefix}_mapped_anndata.h5ad')
    adata.obs['study'] = save_prefix
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    adata.X = adata.layers['counts'].copy()

    metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv' 
    meta = pd.read_csv(metadata, encoding_errors='ignore')  

    # map the pathology group to the subject id in .obs

    adata.obs['pathology.group'] = adata.obs[subject_id[save_prefix]].map(dict(zip(meta[subject_id[save_prefix]].astype(str),meta['pathology.group'])))
    
    adata = adata[adata.obs['pathology.group'].isin(['early', 'late'])]

    del adata.obsm, adata.layers, \
        adata.varm, adata.uns, \
        adata.obsp
    
    adatas.append(adata)
    
    print(adata)

print('----------------------------')
print(f'Concatenated data')
print('----------------------------')

adata_annot = anndata.concat(adatas, join='outer')
adata_annot.obs = adata_annot.obs.replace(np.nan, None)
print(adata_annot)

del adatas, adata

----------------------------
Fetching ALLEN_MTG data
----------------------------
AnnData object with n_obs × n_vars = 41366 × 17950
    obs: 'ID', 'name', 'versionNumber', 'contentType', 'dataFileSizeBytes', 'createdBy', 'createdOn', 'modifiedBy', 'modifiedOn', 'parentId', 'synapseURL', 'dataFileMD5Hex', 'assay_x', 'consortium', 'dataSubtype', 'dataType', 'fileFormat', 'grant', 'individualID_x', 'isModelSystem', 'isMultiSpecimen', 'isMultiome', 'libraryPrep', 'nucleicAcidSource_x', 'organ_x', 'platform', 'readLength', 'resourceType', 'runType', 'sex_x', 'species_x', 'specimenID', 'study', 'tissue_x', 'path', 'error', 'individualID_y', 'specimenIdSource', 'organ_y', 'tissue_y', 'BrodmannArea', 'sampleStatus', 'tissueWeight', 'tissueVolume', 'nucleicAcidSource_y', 'cellType', 'fastingState', 'isPostMortem', 'samplingAge', 'samplingAgeUnits', 'visitNumber', 'assay_y', 'individualID', 'dataset', 'individualIdSource', 'species_y', 'sex_y', 'race', 'ethnicity', 'yearsEducation', 'ageDeath',

In [7]:
# convert nested list of anndata object into Rpy2 object 
subject_id =  robjects.ListVector(
                        {
                            save_prefix: subject_id[save_prefix]
                     
                            for save_prefix in studies
                        }
                    )

In [8]:
%%R -i adata_annot -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i studies -i gene_selection

print(adata_annot)
print('loaded data into memory for recursive use')

class: SingleCellExperiment 
dim: 18168 114140 
metadata(0):
assays(1): X
rownames(18168): A1BG A1CF ... ZZEF1 ZZZ3
rowData names(0):
colnames(114140): AAAGGTAGTACGGCAA AACCAACGTGGATCGA ...
  EC10_TTTGTCATCTATCGCC EC10_TTTGTCATCTCTGCTG
colData names(119): ID name ... doublet_score predicted_doublet
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
[1] "loaded data into memory for recursive use"


In [9]:
del adata_annot

## **Consensus network analysis of celltypes in AD**

Here, we perform consensus network analysis between all three datastes in AD. We will also follow the “standard” workflow (not consensus), and compare the resulting gene module assignments.

First, we setup the seurat object for hdWGCNA and we construct metacells. This part of the workflow is the same as the standard hdWGCNA workflow, but we must include the `relevant metadata for consensus network analysis when running MetacellsByGroups`. 

The loaded Seurat object has a metadata column called `pathology.group`, which contains a `early` or `late` AD identifiers depending on the pathological stage of the donor where a given cell originated from, so we must include  in our group.by list within MetacellsByGroups in order to run consensus network analysis.



## **Set up Seurat object for WGCNA**

For our specific case, we will either select genes that are expressed in at least 5% of cells within each cell type or specify a list of custom genes. Additionally, we will rename our hdWGCNA experiment to enhance clarity.

Here, the `gene_selection` parameter is set to `custom`, we employ the genes present in all the overlapping pathways in a cell type-specific and study-specific manner.

To accomplish this, we load the list of overlapping pathways that was previously saved in `../results/{test_name}/{test_name}_pathway_overlap.csv` from the `pathway_meta_analysis.ipynb` notebook. Additionally, we load the genes from the `.gmt` file saved in `../results/gsva/{study}/{cur_celltype}/gs.gmt` from the `pathway_analysis.ipynb` notebook.

##### **Gene Set Selection**

To obtain the gene sets required for the `SetupForWGCNA` function, we consider the `gene_set_select` variable when `gene_select` is set to `custom`. The `gene_set_select` variable determines how the `gene_list` passed into the `SetupForWGCNA` function will be obtained. 

There are three options available:

- `diff_exp`: This option utilizes the genes present in the differentially expressed pathways, previously computed and saved to the `../results/{test_name}/{save_prefix}_differentially_expressed_pathways.csv` file.

- `overlap`: Here, the genes used are those expressed in pathways that overlap across different studies. The pathway overlap information is retrieved from the `../results/{test_name}/{test_name}_pathway_overlap.csv` file.

- `all`: With this option, all genes in the pathways stored in the `../results/gsva/{save_prefix}/{celltype}/gs.gmt` file are included.

By specifying the appropriate `gene_select` and `gene_set_select` values, you can customize the gene selection process for your hdWGCNA experiment.


In [10]:
%%R -o geneSets -i gene_set_select

# iterate over celltypes and obtain 
geneSets <- list()
for(test_name in test_names){

  geneSets[[test_name]] <- list()
  for (cur_celltype in celltypes){

    geneSets[[test_name]][[cur_celltype]] <- list()
    for (study in studies){
        # get gene sets
        geneSets[[test_name]][[cur_celltype]][[study]] <- getGmt(file.path('../results/gsva/', study, '/', cur_celltype, 'gs.gmt'))
        geneSets[[test_name]][[cur_celltype]][[study]] <- geneIds(geneSets[[test_name]][[cur_celltype]][[study]])

        if (gene_set_select=='overlap'){
          pathways <- read.csv(paste0('../results/',test_name, '/', test_name, '_pathway_overlap.csv'))
          pathways <- pathways$pathway
        }else if (gene_set_select=='diff_exp'){
          pathways <- read.csv(paste0('../results/', test_name, '/', study, '_differentially_expressed_pathways.csv'))
          pathways <- subset(pathways, P.Value < 0.05 & celltype == cur_celltype)$pathway
        }else if (gene_set_select=='all'){
          pathways <- names(geneSets[[test_name]][[cur_celltype]][[study]])
        }
        geneSets[[test_name]][[cur_celltype]][[study]] <-  geneSets[[test_name]][[cur_celltype]][[study]][names(geneSets[[test_name]][[cur_celltype]][[study]]) %in% pathways]
        geneSets[[test_name]][[cur_celltype]][[study]] <- unlist(geneSets[[test_name]][[cur_celltype]][[study]], recursive = FALSE)
    }
    geneSets[[test_name]][[cur_celltype]] <- unique(unlist(geneSets[[test_name]][[cur_celltype]], recursive = FALSE))
  }
}


In [11]:
# convert nested list of geneSets object into Rpy2 object 

geneSets =  robjects.ListVector(
                        {
                            test_name: robjects.ListVector(
                                    {cell_type: geneSets[test_name][cell_type]
                                                 
                                    for cell_type in celltypes
                                    }
                                )
                            for test_name in test_names
                        }

                        )

In [12]:
%%R -o seurat_obj -i geneSets

seurat_obj <- as.Seurat(adata_annot, counts = "X", data = "X")

print(seurat_obj)

# Perform dimensionality reduction and plot

seurat_obj <- FindVariableFeatures(seurat_obj)
seurat_obj <- ScaleData(seurat_obj)
seurat_obj <- RunPCA(seurat_obj)
seurat_obj <- RunHarmony(seurat_obj, group.by.vars = "study")
seurat_obj <- RunUMAP(seurat_obj, reduction='harmony', n.neighbors=15, dims=1:30, min.dist=0.1)

p <- DimPlot(seurat_obj, group.by = "cell_type", label = TRUE) +
     umap_theme() + ggtitle("all_studies") + NoLegend()


fig_dir = paste0("../results/hdWGCNA/UMAP/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

# Save plot to PDF
pdf(file = paste0(fig_dir, "Integrated_umap_all_studies.pdf"), width = 4, height = 4, useDingbats = FALSE)
print(p)
dev.off()


# create a hdWGCNA experiment for each celltype
seurat_dat <- seurat_obj
seurat_obj <- list()

for (cur_cell_type in celltypes){

  seurat_obj[[cur_cell_type]] <- subset(seurat_dat, cell_type == cur_cell_type)

  print(paste0('Creating hdWGNA Experiment for ', toupper(cur_cell_type)))

  if (gene_selection == 'custom'){
    seurat_obj[[cur_cell_type]] <- SetupForWGCNA(
      seurat_obj[[cur_cell_type]],
      gene_select = "custom",                                 # the gene selection approach
      gene_list = as.vector(geneSets[['ad_vs_no']][[cur_cell_type]]),    # list of genes to be included
      group.by = 'study',                                 # grouping parameter
      wgcna_name = paste0(toupper(substr(cur_cell_type, 1, 3)), '_consensus')       # the name of the hdWGCNA experiment
    )
  } else {
    seurat_obj[[cur_cell_type]] <- SetupForWGCNA(
      seurat_obj[[cur_cell_type]],
      gene_select = "fraction",                                                 # the gene selection approach
      fraction = 0.05,                                                          # fraction of cells for gene inclusion
      group.by = 'study',                                                       # grouping parameter
      wgcna_name = paste0(toupper(substr(cur_cell_type, 1, 3)), '_consensus')       # the name of the hdWGCNA experiment
    )
  }
  print(seurat_obj[[cur_cell_type]])
  print(paste0(length(GetWGCNAGenes(seurat_obj[[cur_cell_type]])), " WGCNA Genes"))

}

rm(seurat_dat)
rm(adata_annot)

An object of class Seurat 
18168 features across 114140 samples within 1 assay 
Active assay: originalexp (18168 features, 0 variable features)
[1] "Creating hdWGNA Experiment for INHIBITORY"
An object of class Seurat 
18168 features across 18139 samples within 1 assay 
Active assay: originalexp (18168 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "4547 WGCNA Genes"


Send list to R interface

In [13]:
# convert nested list of Seurat object into Rpy2 object 

seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in celltypes
                        }
                    )

## **Construct metacells**

After setting up our Seurat object, we construct metacells from the single-cell dataset.

First, we setup the seurat object for hdWGCNA and we construct metacells. This part of the workflow is the same as the standard hdWGCNA workflow, but we must include the `relevant metadata for consensus network analysis when running MetacellsByGroups`. 

The loaded Seurat object has a metadata column called `pathology.group`, which contains a `early` or `late` AD identifiers depending on the pathological stage of the donor where a given cell originated from, so we must include  in our group.by list within MetacellsByGroups in order to run consensus network analysis.

In [14]:
%%R -i seurat_obj -o seurat_obj

# construct metacells in each group for each wgnca experiment

for (cell_type in celltypes){

  print(paste0('Constructing MetaCells in hdWGCNA Experiment for ', toupper(cell_type)))
  
  seurat_obj[[cell_type]] <- MetacellsByGroups(
      
    seurat_obj = seurat_obj[[cell_type]],
    group.by = c("pathology.group", "study", "cell_type"), # specify the columns in seurat_obj@meta.data to group by
    reduction = 'harmony',  # select the dimensionality reduction to perform KNN on
    k = 25, # nearest-neighbors parameter
    max_shared = 10, # maximum number of shared cells between two metacells
    ident.group = 'cell_type', # set the Idents of the metacell seurat object
    wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus')        # the name of the hdWGCNA experiment
    )

    # normalize metacell expression matrix:
  seurat_obj[[cell_type]] <- NormalizeMetacells(seurat_obj[[cell_type]],  
                            wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus'))
}


[1] "Constructing MetaCells in hdWGCNA Experiment for INHIBITORY"


In [15]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in celltypes
                        }
                    )

## **Co-expression network analysis**

Next we have to set up the expression dataset for consensus network analysis. In the standard workflow, we use the function `SetDatExpr` to set up the expression matrix that will be used for network analyis. Instead of `SetDatExpr`, here we use `SetMultiExpr` to set up a separate expression matrix for each `study`. This allows us to perform network analysis individually on these expression matrices.

In [16]:
%%R -i seurat_obj -o seurat_obj

# set up a gene expression matrix list for each study
# construct metacells in each group for each wgnca experiment

for (cell_type in celltypes){

  print(paste0('Setting up expression matrix in hdWGCNA Experiment for ', toupper(cell_type)))
  
  seurat_obj[[cell_type]] <- SetMultiExpr(
      seurat_obj[[cell_type]],
      group_name = cell_type, # the name of the group of interest in the group.by column
      group.by = 'cell_type', # the metadata column containing the cell type info. This same column should have also been used in MetacellsByGroups  
      assay = 'originalexp', # using RNA assay
      slot = 'data', # using normalized data
      wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus'), # the name of the hdWGCNA experiment
      multi.group.by = "study",
      multi_groups = NULL # this parameter can be used to select a subset of groups in the multi.group.by column
    )
}


[1] "Setting up expression matrix in hdWGCNA Experiment for INHIBITORY"
  ..Excluding 164 genes from the calculation due to too many missing samples or zero variance.


In [17]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in celltypes
                        }
                    )

### **Select soft-power threshold**

Now that we have expression matrices for all studies, we identify an appropriate soft power threshold for each of these matrices separately. Instead of the `TestSoftPowers` function which we use in the standard hdWGCNA workflow, `we use TestSoftPowersConsensus`, which will perform the test for each of the expression matrices. When we plot the results with  `PlotSoftPowers`, we get a nested list of plots for each dataset, and we can assemble the plots using patchwork.


In [18]:
%%R -i seurat_obj -o seurat_obj

fig_dir = paste0("../results/hdWGCNA/SoftPower/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in celltypes){
  
  print(paste0('Estinmating Soft-Power Threshold in hdWGCNA Experiment for ', toupper(cell_type)))

  # Test different soft powers:
  seurat_obj[[cell_type]] <- TestSoftPowersConsensus(
    seurat_obj[[cell_type]],
    group.by = 'cell_type',
    group_name = cell_type,
  )

  # plot the results:
  plot_list <- PlotSoftPowers(seurat_obj[[cell_type]], 
                              # wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus') # the name of the hdWGCNA experiment
                              )

  # get just the scale-free topology fit plot for each group
  consensus_groups <- unique(seurat_obj[[cell_type]]$study)

  p_list <- lapply(1:length(consensus_groups), function(i){
    cur_group <- consensus_groups[[i]]
    plot_list[[i]][[1]] + ggtitle(paste0('Study: ', cur_group)) + theme(plot.title=element_text(hjust=0.5))
  })

  pdf(paste0(fig_dir, cell_type, '_consensus_SoftPower.pdf'), width=10, height=8)
  print(wrap_plots(p_list, ncol=length(studies)))
  dev.off()
}


[1] "Estinmating Soft-Power Threshold in hdWGCNA Experiment for INHIBITORY"
[1] "allen_mtg"
pickSoftThreshold: will use block size 4383.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 4383 of 4383
   Power SFT.R.sq slope truncated.R.sq  mean.k. median.k.  max.k.
1      1  0.17600 26.10          0.881 2.21e+03  2.21e+03 2300.00
2      2  0.09080  9.87          0.952 1.13e+03  1.13e+03 1240.00
3      3  0.00128 -0.71          0.860 5.82e+02  5.78e+02  697.00
4      4  0.30300 -7.31          0.766 3.04e+02  2.98e+02  413.00
5      5  0.78700 -9.32          0.926 1.60e+02  1.55e+02  256.00
6      6  0.92200 -7.85          0.974 8.53e+01  8.10e+01  165.00
7      7  0.93800 -6.22          0.972 4.62e+01  4.26e+01  110.00
8      8  0.94600 -4.92          0.970 2.54e+01  2.25e+01   76.60
9      9  0.96400 -3.97          0.979 1.42e+01  1.19e+01   55.10
10    10  0.95200 -3.40          0.968 8.13e+00  6.38e+00   40.80
11    12  0.97500 -2.56    

In [19]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in celltypes
                        }
                    )

The general guidance for WGCNA and hdWGCNA is to `pick the lowest soft power threshold that has a Scale Free Topology Model Fit greater than or equal to 0.8`. Later on, the `ConstructNetwork `will automatically select the soft power threshold if we do not provide one.

The output table from the parameter sweep is stored in the hdWGCNA experiment and can be accessed using the GetPowerTable function for further inspection:

In [20]:
# %%R -i seurat_obj -o seurat_obj

# power_table <- GetPowerTable(seurat_obj)
# head(power_table)

### **Construct co-expression network**

Now we construct the co-expression network and identify gene modules using the `ConstructNetwork` function, making sure to specify `consensus=TRUE`. 

Indicating `consensus=TRUE tells hdWGCNA to construct a separate network for each expression matrix`, followed by integrating the networks and identifying gene modules. Depending on the results of `TestSoftPowersConsensus`, we can supply a different soft power threshold for each dataset.

In [21]:
soft_powers = input('Look at the file saved to "../results/hdWGCNA/SoftPower/Consensus/Consensus_SoftPower" \
                    and input the softpowers for each study starting from left to right, separated by comma with no space: ').strip(" ").split(',')

soft_powers = [int(i) for i in soft_powers]

In [22]:
%%R -i seurat_obj -o seurat_obj -i soft_powers


for (cell_type in celltypes) {
  
  print(paste0('Constructing co-expression network in hdWGCNA Experiment for ', toupper(cell_type)))
  
  tryCatch({
    # construct co-expression network:
    seurat_obj[[cell_type]] <- ConstructNetwork(
      seurat_obj[[cell_type]], 
      soft_power = as.vector(soft_powers), # Set to NULL so that the ConstructNetwork function obtains it automatically (Scale Free Topology Model Fit >= 0.8)
      setDatExpr = FALSE,
      overwrite_tom = TRUE,
      consensus = TRUE,
      tom_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus') # name of the topological overlap matrix written to disk set as the name of the hdWGCNA experiment 
    )
  }, error = function(e) {
    print(paste0('Error encountered while processing ', toupper(cell_type)))
    print(paste0(toupper(cell_type), ' dropped from experiment'))
    # Code to handle the error condition
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv) # Update the modified seurat_obj in the global environment

  },  
  message = function(m) {
    print(paste0('Error encountered while processing ', toupper(cell_type)))
    print(paste0(toupper(cell_type), ' dropped from experiment'))
    # Code to handle the error condition
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv) # Update the modified seurat_obj in the global environment

  }) 
  
}

print(names(seurat_obj))


[1] "Constructing co-expression network in hdWGCNA Experiment for INHIBITORY"
 Calculating consensus modules and module eigengenes block-wise from all genes
 Calculating topological overlaps block-wise from all genes
   Flagging genes and samples with too many missing values...
    ..step 1
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
 ..Working on block 1 .
 ..Working on block 1 .
 ..merging consensus mod

In [23]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

hdWGCNA also includes a function PlotDendrogram to visualize the WGCNA dendrogram, a common visualization to show the different co-expression modules resulting from the network analysis. Each leaf on the dendrogram represents a single gene, and the color at the bottom indicates the co-expression module assignment.


`Importantly, the “grey” module consists of genes that were not grouped into any co-expression module. The grey module should be ignored for all downstream analysis and interpretation.`

In [24]:
%%R -i seurat_obj -o seurat_obj

fig_dir = paste0("../results/hdWGCNA/dendrogram/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

tryCatch({
    for (cell_type in names(seurat_obj)){
        # print(paste0('Constructing co-expression networkd in hdWGCNA Experiment for ', toupper(cell_type)))
        pdf(paste0(fig_dir, cell_type, "_consensus_dendrogram.pdf"), height=2, width=4)
        PlotDendrogram(seurat_obj[[cell_type]], main='hdWGCNA Dendrogram')
        dev.off()
    }
}, error = function(e){
    NULL
})

In [25]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

### **Optional: Compare Consensus network results to the standard hdWGCNA workflow**

### **Standard hdWGCNA workflow**


In [26]:
%%R -i seurat_obj -o seurat_obj 

fig_dir = paste0("../results/hdWGCNA/dendrogram/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in celltypes) {
  
  print(paste0('Running standard hdWGCNA workflow in hdWGCNA Experiment for ', toupper(cell_type)))
  
  # setup new hdWGCNA experiment
  seurat_obj[[cell_type]] <- SetupForWGCNA(
    seurat_obj[[cell_type]],
    gene_select = "custom",
    gene_list = as.vector(geneSets[['ad_vs_no']][[cell_type]]),
    wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_standard'),
    metacell_location = paste0(toupper(substr(cell_type, 1, 3)), '_consensus') # use the same metacells
  )
  seurat_obj[[cell_type]] <- NormalizeMetacells(seurat_obj[[cell_type]])

  # construct network
  seurat_obj[[cell_type]] <- SetDatExpr(seurat_obj[[cell_type]], group_name = cell_type, group.by = "cell_type")
  seurat_obj[[cell_type]] <- TestSoftPowers(seurat_obj[[cell_type]])
  seurat_obj[[cell_type]] <- ConstructNetwork(seurat_obj[[cell_type]], soft_power=NULL, overwrite_tom = TRUE,
                              tom_name = paste0(toupper(substr(cell_type, 1, 3)), '_standard'))

  # plot the dendrogram
  pdf(paste0(fig_dir, cell_type, "_standard_dendrogram.pdf"), height=2, width=4)
  print(PlotDendrogram(seurat_obj[[cell_type]], main='ASC standard Dendrogram'))
  dev.off()

}

[1] "Running standard hdWGCNA workflow in hdWGCNA Experiment for INHIBITORY"
pickSoftThreshold: will use block size 4547.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 4547 of 4547
   Power SFT.R.sq slope truncated.R.sq  mean.k. median.k. max.k.
1      1    0.308 26.20          0.963 2310.000  2.31e+03 2410.0
2      2    0.121 -7.39          0.858 1190.000  1.19e+03 1380.0
3      3    0.640 -9.25          0.775  626.000  6.17e+02  839.0
4      4    0.872 -6.98          0.886  334.000  3.24e+02  543.0
5      5    0.946 -5.37          0.936  182.000  1.71e+02  370.0
6      6    0.961 -4.16          0.949  101.000  9.14e+01  265.0
7      7    0.953 -3.36          0.941   57.400  4.92e+01  198.0
8      8    0.949 -2.81          0.936   33.600  2.66e+01  153.0
9      9    0.938 -2.42          0.923   20.300  1.46e+01  122.0
10    10    0.930 -2.14          0.911   12.700  8.01e+00   99.9
11    12    0.885 -1.81          0.855    5.620  2.47

In [27]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

We can plot the gene module assignments from the standard workflow under those from the consensus analysis in the same dendrogram plot as a rough comparison.


In [28]:
%%R -i seurat_obj -o seurat_obj 

fig_dir = paste0("../results/hdWGCNA/dendrogram/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in celltypes) {
  
  print(paste0('Comparing module assignments in hdWGCNA Experiment for ', toupper(cell_type)))
  
  # get both sets of modules
  modules <- GetModules(seurat_obj[[cell_type]], wgcna_name=paste0(toupper(substr(cell_type, 1, 3)), '_standard'))
  consensus_modules <- GetModules(seurat_obj[[cell_type]], wgcna_name=paste0(toupper(substr(cell_type, 1, 3)), '_consensus'))

  # get consensus dendrogram
  net <- GetNetworkData(seurat_obj[[cell_type]], wgcna_name=paste0(toupper(substr(cell_type, 1, 3)), '_consensus'))
  dendro <- net$dendrograms[[1]]

  # get the gene and module color for consensus
  consensus_genes <- consensus_modules$gene_name
  consensus_colors <- consensus_modules$color
  names(consensus_colors) <- consensus_genes

  # get the gene and module color for standard
  genes <- modules$gene_name
  colors <- modules$color
  names(colors) <- genes

  # re-order the genes to match the consensus genes
  colors <- colors[consensus_genes]

  # set up dataframe for plotting
  color_df <- data.frame(
    consensus = consensus_colors,
    standard = colors
  )


  pdf(paste0(fig_dir, cell_type, "_consensus_&_standard_dendrogram.pdf"), height=2, width=4)
  # plot dendrogram using WGCNA function
  WGCNA::plotDendroAndColors(
    net$dendrograms[[1]],
    color_df,
    groupLabels=colnames(color_df),
    dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05,
    main = "AD consensus dendrogram",
  )
  dev.off()

}

[1] "Comparing module assignments in hdWGCNA Experiment for INHIBITORY"


In [29]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

It is clear to see that much of the co-expression structure is missing when using the standard workflow when the dataset contains vast differences coming from the individual studies. We can take the consensus network and perform any of the downstream hdWGCNA analysis steps.


## **Module Eigengenes and Connectivity**

### **Compute harmonized module eigengenes**

`Module Eigengenes (MEs)` are a commonly used metric to summarize the gene expression profile of an entire co-expression module. Briefly, module eigengenes are computed by performing principal component analysis (PCA) on the subset of the gene expression matrix comprising each module. `The first PC of each of these PCA matrices are the MEs.`

Dimensionality reduction techniques are a very hot topic in single-cell genomics. It is well known that technical artifacts can muddy the analysis of single-cell datasets, and over the years there have been many methods that aim to reduce the effects of these artifacts. Therefore it stands to reason that `MEs would be subject to these technical artifacts as well`, and hdWGCNA seeks to alleviate these effects.

`hdWGCNA includes a function ModuleEigengenes` to compute module eigengenes in single cells. Additionally, we can apply Harmony batch correction to the MEs, yielding harmonized module eigengenes (hMEs).

The following code performs the module eigengene computation `harmonizing by the Sample of origin using the group.by.vars parameter.`

### **Compute module connectivity**

In co-expression network analysis, we often want to focus on the `“hub genes”`,`those which are highly connected within each module.` Therefore we wish to determine the `eigengene-based connectivity, also known as kME`, of each gene. 

hdWGCNA includes the ModuleConnectivity to compute the kME values in the full single-cell dataset, rather than the metacell dataset. This function essentially computes pairwise correlations between genes and module eigengenes. kME can be computed for all cells in the dataset, but we recommend computing kME in the cell type or group that was previously used to run ConstructNetwork.

In [30]:
%%R -i seurat_obj -o seurat_obj

for (cell_type in names(seurat_obj)){

    # change active hdWGCNA experiment to consensus
    
    seurat_obj[[cell_type]] <- SetActiveWGCNA(seurat_obj[[cell_type]],
                             wgcna_name=paste0(toupper(substr(cell_type, 1, 3)), '_consensus'))

    print(paste0('Estimating module eigen-genes in hdWGCNA Experiment for ', toupper(cell_type)))
    
    # need to run ScaleData first or else harmony throws an error:
    # seurat_obj[[cell_type]] <- ScaleData(seurat_obj[[cell_type]], features=VariableFeatures(seurat_obj[[cell_type]]))

    # compute all MEs in the full single-cell dataset
    seurat_obj[[cell_type]] <- ModuleEigengenes(
    seurat_obj[[cell_type]],
    group.by.vars='study'
    )

    print(paste0('Estimating module connectivity in hdWGCNA Experiment for ', toupper(cell_type)))

    # compute eigengene-based connectivity (kME):
    seurat_obj[[cell_type]] <- ModuleConnectivity(
        seurat_obj[[cell_type]],
        group.by = 'cell_type', 
        group_name = cell_type
    )
    
    print('----------------')

}

[1] "Estimating module eigen-genes in hdWGCNA Experiment for INHIBITORY"
[1] "yellow"
[1] "grey"
[1] "blue"
[1] "turquoise"
[1] "brown"
[1] "Estimating module connectivity in hdWGCNA Experiment for INHIBITORY"
[1] "----------------"


In [31]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

For convenience, we re-name the hdWGCNA modules to indicate that they are from the Excitatory neuron group.

In [32]:
%%R -i seurat_obj -o seurat_obj

for (cell_type in names(seurat_obj)){

  print(paste0('Renaming modules in hdWGCNA Experiment for ', toupper(cell_type)))

  # rename the modules
  seurat_obj[[cell_type]] <- ResetModuleNames(
    seurat_obj[[cell_type]],
    new_name = paste0(toupper(substr(cell_type, 1, 3)), "-M"),
    wgcna_name = paste0(toupper(substr(cell_type, 1, 3)), '_consensus') # the name of the hdWGCNA experiment,
  )
}

[1] "Renaming modules in hdWGCNA Experiment for INHIBITORY"


In [33]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

We also reset module colors

In [34]:
%%R -i seurat_obj -o seurat_obj 



library(MetBrewer)

for (cell_type in names(seurat_obj)){

  print(paste0('Resetting module colors in hdWGCNA Experiment for ', toupper(cell_type)))


  modules <- GetModules(seurat_obj[[cell_type]])
  mods <- levels(modules$module)
  mod_colors <- dplyr::select(modules, c(module, color)) %>%
    distinct %>% arrange(module) %>% .$color
  n_colors <- length(mod_colors) -1

  new_colors <- paste0(met.brewer("Signac", n=n_colors,))
  new_colors <- sample(new_colors)
  seurat_obj[[cell_type]] <- ResetModuleColors(seurat_obj[[cell_type]], new_colors)

}


[1] "Resetting module colors in hdWGCNA Experiment for INHIBITORY"


In [35]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

We can visualize the genes in each module ranked by kME using the PlotKMEs function.

In [36]:
%%R -i seurat_obj -o seurat_obj

library(conflicted)
conflicts_prefer(dplyr::select)

fig_dir = paste0("../results/hdWGCNA/ModuleConnectivity/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){
    
    print(paste0('Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for ', toupper(cell_type)))

    # plot genes ranked by kME for each module

    p <- PlotKMEs(seurat_obj[[cell_type]], ncol=5)

    pdf(paste0(fig_dir, cell_type, 'consensus_KMEs.pdf'), width=12, height=8)
    print(p)
    dev.off()
}

[conflicted] Will prefer dplyr::select over any other package.
[1] "Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for INHIBITORY"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"


In [37]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

## **Compute hub gene signature scores**

Gene scoring analysis is a popular method in single-cell transcriptomics for computing a score for the overall signature of a set of genes. Seurat implements their own gene scoring technique using the `AddModuleScore` function, but there are also alternative approaches such as UCell.

hdWGCNA includes the function `ModuleExprScore` to compute gene scores for a give number of genes for each module, using either the Seurat or UCell algorithm. Gene scoring is an alternative way of summarizing the expression of a module from computing the module eigengene.

In [38]:
%%R -i seurat_obj -o seurat_obj

for (cell_type in names(seurat_obj)){
    
  print(paste0('Compute hub gene scores in hdWGCNA Experiment for ', toupper(cell_type)))
    
  # compute gene scoring for the top 25 hub genes by kME for each module
  # with Seurat method
  seurat_obj[[cell_type]] <- ModuleExprScore(
    seurat_obj[[cell_type]],
    n_genes = 25,
    method='Seurat'
  )
  
  # compute gene scoring for the top 25 hub genes by kME for each module
  # with UCell method
  library(UCell)
  seurat_obj[[cell_type]] <- ModuleExprScore(
    seurat_obj[[cell_type]],
    n_genes = 25,
    method='UCell'
  )
}

[1] "Compute hub gene scores in hdWGCNA Experiment for INHIBITORY"
Selecting by kME_INH-M1
Selecting by kME_INH-M2
Selecting by kME_INH-M3
Selecting by kME_INH-M4
Selecting by kME_INH-M1
Selecting by kME_INH-M2
Selecting by kME_INH-M3
Selecting by kME_INH-M4


In [39]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

## **Basic Visualization**

Here we visualize some results using hdWGCNA, and we employ some of Seurat’s built-in plotting tools to visualize our hdWGCNA results.



### **Module Feature Plots**

FeaturePlot is a commonly used Seurat visualization to show a feature of interest directly on the dimensionality reduction. hdWGCNA includes the ModuleFeaturePlot function to consruct FeaturePlots for each co-expression module colored by each module’s uniquely assigned color.

In [40]:
%%R -i seurat_obj -o seurat_obj


fig_dir = paste0("../results/hdWGCNA/FeaturePlots/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}


for (cell_type in names(seurat_obj)){
    
  print(paste0('Module Feature Plots in hdWGCNA Experiment for ', toupper(cell_type)))
     
  # make a featureplot of hMEs for each module
  plot_list <- ModuleFeaturePlot(
    seurat_obj[[cell_type]],
    features='hMEs', # plot the hMEs
    order=TRUE, # order so the points with highest hMEs are on top
    # raster=TRUE, 
    # raster_dpi=400, 
    # alpha=1,
    # restrict_range=FALSE, 
    # raster_scale=0.25,
  )

  plot_list <- lapply(1:length(plot_list), function(x){
    plot_list[[x]] + NoLegend() + theme(plot.title=element_text(face='plain', vjust=0.25), plot.margin=margin(c(0,0,0,0)))
  })

  pdf(paste0(fig_dir, cell_type, "_consensus_featureplot_MEs.pdf"), height=10, width=5)
  print(wrap_plots(plot_list, ncol=3))
  dev.off()
}

[1] "Module Feature Plots in hdWGCNA Experiment for INHIBITORY"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"


In [41]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

We can also plot the hub gene signature score using the same function:

In [42]:
%%R -i seurat_obj -o seurat_obj


fig_dir = paste0("../results/hdWGCNA/FeaturePlots/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}


for (cell_type in names(seurat_obj)){
    
  print(paste0('Module Feature Plots for HUB GENES in hdWGCNA Experiment for ', toupper(cell_type)))
     
  # make a featureplot of hMEs for each module
  plot_list <- ModuleFeaturePlot(
    seurat_obj[[cell_type]],
    features='scores',  # plot the hub gene scores
    order='shuffle', # order so cells are shuffled
    ucell = TRUE, # depending on Seurat vs UCell for gene scoring
    # raster=TRUE, 
    # raster_dpi=400, 
    # alpha=1,
    # restrict_range=FALSE, 
    # raster_scale=0.25,
  )

  plot_list <- lapply(1:length(plot_list), function(x){
    plot_list[[x]] + NoLegend() + theme(plot.title=element_text(face='plain', vjust=0.25), plot.margin=margin(c(0,0,0,0)))
  })

  pdf(paste0(fig_dir, cell_type, "_consensus_featureplot_hubs.pdf"), height=10, width=5)
  print(wrap_plots(plot_list, ncol=3))
  dev.off()
}

[1] "Module Feature Plots for HUB GENES in hdWGCNA Experiment for INHIBITORY"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"


In [43]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

## **Overlap With AD Genes**

Next on the co-expression UMAP, we show&quantify the overlap between hub genes that have a known association with AD using the AD databases obtained from the [**KEGG Alzheimer's pathway**](https://www.genome.jp/pathway/hsa05010) and [**Harmonizome (Mayaanlab)**](https://maayanlab.cloud/Harmonizome/gene_set/Alzheimer+Disease/dbGAP+Gene-Trait+Associations) 

In [44]:
KEGG_paths = pathway_analyses.read_pathways('../data/pathway_databases/KEGG_2019_Human.txt')
KEGG_genes = [gene for gene in list(KEGG_paths[(KEGG_paths[0].str.startswith('Alzheimer disease'))].iloc[:, 1:].values[0]) if str(gene)!="nan"]
mayaanlab_genes = pd.read_csv('../data/pathway_databases/AD_genes.csv').Symbol.to_list()
KEGG_genes.extend(mayaanlab_genes)

AD_genes = list(set(KEGG_genes))

In [68]:
%%R -i seurat_obj -o seurat_obj -i AD_genes

# load modules

fig_dir = paste0("../results/hdWGCNA/GeneOverlap/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}


for (cell_type in names(seurat_obj)){

  print(paste0('Estimating AD gene overlap in hdWGCNA Experiment for ', toupper(cell_type)))

  modules <- GetModules(seurat_obj[[cell_type]])
  mods <- levels(modules$module)
  genome.size <- dim(seurat_obj[[cell_type]])[1]


  overlap_df <- do.call(rbind, lapply(mods, function(cur_mod){

  cur_genes <- modules %>% subset(module == cur_mod) %>% .$gene_name

  cur_overlap <- testGeneOverlap(newGeneOverlap(
      cur_genes,
      as.vector(AD_genes),
      genome.size=genome.size
  ))

  cur_overlap <- data.frame(
    'odds.ratio' = cur_overlap@odds.ratio,
    'pval' = cur_overlap@pval,
    'Jaccard' = cur_overlap@Jaccard,
    'size_intersection' = length(cur_overlap@intersection),
    'module' = cur_mod
  )

    cur_overlap

  })) %>% as.data.frame()

  overlap_df <- overlap_df %>% mutate(fdr=p.adjust(pval, method='fdr'))
  overlap_df <- overlap_df %>% subset(module != 'grey')

  ################################################################################
  # Plot as a lollipop
  ################################################################################

  overlap_df$shape <- ifelse(overlap_df$fdr < 0.05, 21, 4)
  overlap_df <- overlap_df %>% arrange(odds.ratio, descending=TRUE)
  overlap_df$module <- factor(as.character(overlap_df$module), levels=as.character(overlap_df$module))

  mod_colors <- dplyr::select(modules, c(module, color)) %>%
    distinct
  cp <- mod_colors$color; names(cp) <- mod_colors$module

  p <- overlap_df %>%
    ggplot(aes(y=module, x=odds.ratio, size= size_intersection, color=module)) +
    geom_segment(aes(y=module, yend=module, x=0, xend=odds.ratio), size=0.5, color='grey') +
    geom_point() +
    geom_point(shape=overlap_df$shape, color='black', fill=NA) +
    scale_color_manual(values=cp, guide='none') +
    ylab('') + xlab("Odds ratio") +
    scale_x_continuous(breaks = seq(0, 19, 2)) +
    labs(size='Size\nintersection') +
    ggtitle('Overlap with AD genes') +

    theme(
      panel.border = element_rect(size=1, color='black', fill=NA),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      plot.title = element_text(hjust=0.5, face='plain')
    )

  pdf(paste0(fig_dir, cell_type, '_consensus_AD_overlap.pdf'), width=4, height=3.5)
  print(p)
  dev.off()
}


[1] "Estimating AD gene overlap in hdWGCNA Experiment for INHIBITORY"
[1] 18168


In [69]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

## **Overlap With DEGs**

We quantify the overlap between hub genes and DEGs obtained from previous analysis and saved at `../results/{test_name}/{save_prefix}_{deg_method}_degs.xlsx`.

In [70]:
%%R -i seurat_obj -o seurat_obj -i deg_method

# load modules

fig_dir = paste0("../results/hdWGCNA/GeneOverlap/Consensus/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}


for (test in c('ad_vs_no')){
  
  for (cell_type in names(seurat_obj)){

    print(paste0('Estimating AD gene overlap in hdWGCNA Experiment for ', toupper(cell_type)))

    degs <- list()

    for (study in studies){
      wgcna_genes <- GetWGCNAGenes(seurat_obj[[cell_type]])
      hub_genes <- GetHubGenes(seurat_obj[[cell_type]], 25)
      degs[[study]] <- read_excel(paste0('../results/', test, '/', study, '_', deg_method, '_degs.xlsx'), sheet = cell_type)
      degs[[study]] <- intersect(hub_genes$gene_name, subset(degs, p_val_adj<0.01 & abs_logFC>0.25)$gene)
    }

    degs <- unique(unlist(degs, recursive=FALSE))

    modules <- GetModules(seurat_obj[[cell_type]])
    mods <- levels(modules$module)
    genome.size <- dim(seurat_obj[[cell_type]])[1]

    overlap_df <- do.call(rbind, lapply(mods, function(cur_mod){

    cur_genes <- modules %>% subset(module == cur_mod) %>% .$gene_name

    cur_overlap <- testGeneOverlap(newGeneOverlap(
        cur_genes,
        as.vector(degs),
        genome.size=genome.size
    ))

    cur_overlap <- data.frame(
      'odds.ratio' = cur_overlap@odds.ratio,
      'pval' = cur_overlap@pval,
      'Jaccard' = cur_overlap@Jaccard,
      'size_intersection' = length(cur_overlap@intersection),
      'module' = cur_mod
    )

      cur_overlap

    })) %>% as.data.frame()

    overlap_df <- overlap_df %>% mutate(fdr=p.adjust(pval, method='fdr'))
    overlap_df <- overlap_df %>% subset(module != 'grey')

    ################################################################################
    # Plot as a lollipop
    ################################################################################

    overlap_df$shape <- ifelse(overlap_df$fdr < 0.05, 21, 4)
    overlap_df <- overlap_df %>% arrange(odds.ratio, descending=TRUE)
    overlap_df$module <- factor(as.character(overlap_df$module), levels=as.character(overlap_df$module))

    mod_colors <- dplyr::select(modules, c(module, color)) %>%
      distinct
    cp <- mod_colors$color; names(cp) <- mod_colors$module

    p <- overlap_df %>%
      ggplot(aes(y=module, x=odds.ratio, size= size_intersection, color=module)) +
      geom_segment(aes(y=module, yend=module, x=0, xend=odds.ratio), size=0.5, color='grey') +
      geom_point() +
      geom_point(shape=overlap_df$shape, color='black', fill=NA) +
      scale_color_manual(values=cp, guide='none') +
      ylab('') + xlab("Odds ratio") +
      scale_x_continuous(breaks = seq(0, 19, 2)) +
      labs(size='Size\nintersection') +
      ggtitle('Overlap with DEGs') +

      theme(
        panel.border = element_rect(size=1, color='black', fill=NA),
        axis.line.y = element_blank(),
        axis.line.x = element_blank(),
        plot.title = element_text(hjust=0.5, face='plain')
      )

    pdf(paste0(fig_dir, test, '_', cell_type, '_consensus_DEG_overlap.pdf'), width=4, height=3.5)
    print(p)
    dev.off()
  }

}


[1] "Estimating AD gene overlap in hdWGCNA Experiment for INHIBITORY"
New names:
• `` -> `...1`
New names:
• `` -> `...1`
New names:
• `` -> `...1`


In [62]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

# **Save Seurat Object**

In [None]:
%%R -i seurat_obj -o seurat_obj

dat_dir = paste0("../results/hdWGCNA/SeuratObject/Consensus/")

if (!dir.exists(dat_dir)) {
  dir.create(dat_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){
    
    print(paste0('Saving hdWGCNA object in hdWGCNA Experiment for ', toupper(cell_type)))

    saveRDS(seurat_obj[[cell_type]], file=paste0(dat_dir, cell_type, '_hdWGCNA_object.rds'))

}

[1] "Saving hdWGCNA object in hdWGCNA Experiment for ASTROCYTE"
