In [1]:
import rpy2
import scipy
import logging
import warnings
import anndata
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import decoupler as dc
import scrublet as scr
import decoupler as dc
from scipy import sparse
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from typing import Optional, Union
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

In [2]:
def get_sys_dpi(width, height, diag):
    """
    Estimate the pixel density (dots per inch) of a display.

    width:  horizontal resolution in pixels
            (if unsure, go visit `whatismyscreenresolution.net`)
    height: vertical resolution in pixels
    diag:   screen diagonal in inches

    Returns the DPI rounded to the nearest integer.
    """
    # Squared height-to-width ratio of the panel.
    aspect_sq = height**2 / width**2
    # diag^2 = w^2 + h^2 = w^2 * (1 + h^2 / w^2)  ->  solve for the width in inches.
    width_inches = (diag**2 / (1 + aspect_sq)) ** 0.5
    dpi = width / width_inches
    return round(dpi)

In [3]:
# Ignore R warning messages by raising the level of rpy2's callback logger.
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Hide pending-deprecation, user and future warnings raised on the Python side.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes (the anndata2ri
# converter is activated as well), then load the rpy2 IPython extension
# that the `%%R` cells below rely on.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

# Figure DPI is derived from a hard-coded display: 1512x982 px, 14.125 in diagonal.
rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

# scanpy logging verbosity level
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print the versions of the packages in use (kept in the cell output for provenance).
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
anndata2ri                  1.1
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
cloudpickle                 2.2.0
colorama                    0.4.4
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
decoupler                   1.4.0
defusedxml                  0.7.1
de

# **Consensus Network Enrichment Analysis**

Here, we perform enrichment analysis on the consensus co-expression network using `hdWGCNA`.

- [**Morabito et al. bioRxiv 2022**](https://www.biorxiv.org/content/10.1101/2022.09.22.509094v1)
- [**Morabito & Miyoshi et al. Nature Genetics 2021**](https://www.nature.com/articles/s41588-021-00894-z)


In [4]:
%%R
# Attach every R package used by the R cells of this notebook.
# Package start-up messages are suppressed to keep the cell output short.
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(conflicted)

    # single-cell analysis package
    library(Seurat)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages
    # (WGCNA is already attached at the top of this block, so this call is redundant):
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    library(GSEABase)
    library(GSVA) 
# NOTE(review): later cells use `%>%`; presumably it is provided by the tidyverse
# attached above, so this block must be run in every fresh R session -- confirm.
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
# Analysis parameters shared by the cells below.
map_meta = True
deg_method = 'DESeq2-Wald'
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
filter_genes = "TRUE"               # NOTE(review): a string, not a bool -- presumably consumed on the R side; confirm
studies = ['allen_mtg', 'leng_sfg', 'leng_etc']

# Subject identifier for each study:
# for leng this is `PatientID`, for mathys it is 'Subject', and 'individualID' for allen.
subject_id = {'allen_mtg': 'individualID',
              'leng_sfg': 'PatientID',
              'leng_etc': 'PatientID'}

gene_celltype_threshold = 0.05      # determines number of cells the gene must be expressed in
covariates = ['None']               # list of covariates to be accounted for in regression.
celltypes = ["Inhibitory"]          # ["Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC"]

# Gene selection method used when setting up the seurat object for WGCNA.
# Possible values are "custom", "fraction", "variable".
# If "custom", a list of genes must be passed.
# (Assigned once here; the original cell set the same value twice.)
gene_selection = 'custom'

# Only used if gene_selection = 'custom': specifies how to obtain the list of
# genes to pass into `SetupForWGCNA`. Possible values are "diff_exp", "overlap", "all".
gene_set_select = 'overlap'

In [6]:
# convert nested list of anndata object into Rpy2 object 
subject_id =  robjects.ListVector(
                        {
                            save_prefix: subject_id[save_prefix]
                     
                            for save_prefix in studies
                        }
                    )

## **Load Seurat object for Network Visualizations**

In [7]:
%%R -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i studies -i gene_selection -i gene_set_select -o seurat_obj

# Load the saved hdWGCNA Seurat object of every requested cell type into a
# list keyed by cell type. Loading is best-effort: a cell type whose file
# cannot be read is reported and skipped, and the loop continues.
seurat_obj <- list()

for (cell_type in celltypes) {

    print(paste0('Loading data for hdWGCNA Experiment in ', toupper(cell_type)))

    # Only genuine errors count as a failed load. The previous version swallowed
    # errors silently and instead aborted (and reported failure) whenever any
    # informational message was emitted while loading.
    loaded <- tryCatch(
        readRDS(paste0("../results/hdWGCNA/SeuratObject/Consensus/", cell_type, '_hdWGCNA_object.rds')),
        error = function(e) {
            print(paste0('Could not load data for ', toupper(cell_type), ': ', conditionMessage(e)))
            NULL
        }
    )

    # Keep successfully loaded objects only, so downstream loops over
    # names(seurat_obj) never see a missing entry.
    if (!is.null(loaded)) {
        seurat_obj[[cell_type]] <- loaded
        print(seurat_obj[[cell_type]])
    }

}

print('loaded data')

[1] "Loading data for hdWGCNA Experiment in ASTROCYTE"
An object of class Seurat 
18168 features across 12621 samples within 1 assay 
Active assay: originalexp (18168 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "loaded data"


In [8]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

# **Enrichment analysis**

Next, we perform enrichment tests on the hdWGCNA modules. We leverage the R package enrichR to perform enrichment tests on a wide range of curated gene lists. This analysis should point us towards biological processes that our hdWGCNA modules are involved in. Additionally, we perform a gene set overlap analysis to compare the genes in hdWGCNA modules with the marker genes identified using Seurat’s FindAllMarkers function.


## **EnrichR**

hdWGCNA includes the function RunEnrichr to compare the set of genes in each module with any of the gene lists hosted by Enrichr.

The results of the enrichment tests are stored in the hdWGCNA experiment, so they can be easily retrieved for downstream analysis or exported to external applications like Excel. In the following example, we perform the enrichment test with three Gene Ontology databases:

- `GO_Biological_Process_2021`
- `GO_Cellular_Component_2021`
- `GO_Molecular_Function_2021`

In [9]:
%%R -i seurat_obj -o seurat_obj -o enrich_df

# Enrichr databases to test
dbs <- c('GO_Biological_Process_2021','GO_Cellular_Component_2021','GO_Molecular_Function_2021')

# One enrichment table per cell type, exported to Python through `-o enrich_df`
enrich_df <- list()

for (cell_type in names(seurat_obj)){

  print(paste0('Estimating Module Enrichments in hdWGCNA Experiment for ', toupper(cell_type)))

  # perform enrichment tests; the updated object replaces the list entry
  # (no dangling comma after the last argument, which would pass an empty argument)
  seurat_obj[[cell_type]] <- RunEnrichr(
    seurat_obj[[cell_type]],
    dbs = dbs # character vector of enrichr databases to test
  )

  # retrieve the output table, keeping rows with P.value below 0.05
  # NOTE(review): this filters on the `P.value` column -- confirm whether a
  # multiple-testing-adjusted column should be used instead.
  enrich_df[[cell_type]] <- GetEnrichrTable(seurat_obj[[cell_type]]) %>% subset(P.value < 0.05)

}

[1] "Estimating Module Enrichments in hdWGCNA Experiment for ASTROCYTE"
Selecting by kME_AST-M1
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2021... Done.
  Querying GO_Cellular_Component_2021... Done.
  Querying GO_Molecular_Function_2021... Done.
Parsing results... Done.
Selecting by kME_AST-M2
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2021... Done.
  Querying GO_Cellular_Component_2021... Done.
  Querying GO_Molecular_Function_2021... Done.
Parsing results... Done.
Selecting by kME_AST-M3
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2021... Done.
  Querying GO_Cellular_Component_2021... Done.
  Querying GO_Molecular_Function_2021... Done.
Parsing results... Done.


In [10]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

## **Visualize enrichments**

Now that we have run the enrichment tests, we can visualize the results.

### **EnrichrBarPlot**

hdWGCNA includes the function EnrichrBarPlot to summarize the results of every Enrichr database and every module. This function outputs a .pdf figure for each module, containing a barplot showing the top N enriched terms. The following example plots the top 10 terms in each module and writes the results to a `<cell_type>_consensus_enrichr_plots` folder under `../results/hdWGCNA/Enrichr/Consensus/`.

In [11]:
%%R -i seurat_obj -o seurat_obj

# Root folder that receives one sub-folder of bar plots per cell type
fig_dir <- "../results/hdWGCNA/Enrichr/Consensus/"

if (!dir.exists(fig_dir)) dir.create(fig_dir, recursive=TRUE)

for (cell_type in names(seurat_obj)){

  print(paste0('Enrichr Plots for Modules in hdWGCNA Experiment for ', toupper(cell_type)))

  # GO term bar plots for the modules of this cell type
  EnrichrBarPlot(
    seurat_obj[[cell_type]],
    # name of output directory
    outdir = paste0(fig_dir, cell_type, "_consensus_enrichr_plots"),
    # number of enriched terms to show (sometimes more show if there are ties!!!)
    n_terms = 10,
    # width, height of the output .pdfs
    plot_size = c(5,7),
    # show the enrichment on a log scale
    logscale=TRUE
  )
}

[1] "Enrichr Plots for Modules in hdWGCNA Experiment for ASTROCYTE"
[1] "AST-M1"
[1] "AST-M2"
[1] "AST-M3"


In [12]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

### **EnrichrDotPlot**

hdWGCNA includes an additional visualization function for enrichment results, EnrichrDotPlot, which shows the top results for one Enrichr database in each module. In the following example, we plot the top three terms per module for the GO_Biological_Process_2021, GO_Cellular_Component_2021 and GO_Molecular_Function_2021 databases.

In [13]:
%%R -i seurat_obj -o seurat_obj

conflicts_prefer(dplyr::select)

# Output folder for the dot plots (no trailing slash; paths are built with file.path)
fig_dir <- "../results/hdWGCNA/Enrichr/Consensus"

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

# EnrichrDotPlot shows the results of one Enrichr database per call, so the
# databases are plotted one at a time, each on its own page of the PDF.
dot_dbs <- c('GO_Biological_Process_2021','GO_Cellular_Component_2021','GO_Molecular_Function_2021')

for (cell_type in names(seurat_obj)){

    print(paste0('Enrichr Dot-Plots for Modules in hdWGCNA Experiment for ', toupper(cell_type)))

    # file.path() inserts the separator that paste0(fig_dir, cell_type, ...) was
    # missing; previously the file was written next to the folder as
    # "Consensus<cell_type>_consensus_enrichr_dotplots.pdf".
    pdf(file.path(fig_dir, paste0(cell_type, "_consensus_enrichr_dotplots.pdf")), width=10, height=10, useDingbats=FALSE)

    # `finally` closes the PDF device even if a plot fails, so no graphics
    # device is left open for later cells.
    tryCatch({
        for (db in dot_dbs) {
            # enrichr dotplot: top 3 terms per module for this database
            p <- EnrichrDotPlot(
                seurat_obj[[cell_type]],
                mods='all',
                database = db,
                n_terms=3,
                break_ties=TRUE
            )
            print(p)
        }
    }, finally = {
        dev.off()
    })

}

[conflicted] Will prefer dplyr::select over any other package.
[1] "Enrichr Dot-Plots for Modules in hdWGCNA Experiment for ASTROCYTE"


In [14]:
# convert nested list of Seurat object into Rpy2 object 
seurat_obj =  robjects.ListVector(
                        {
                            cell_type: seurat_obj[cell_type]
                     
                            for cell_type in seurat_obj.keys()
                        }
                    )

# **Save Seurat Object**

In [15]:
%%R -i seurat_obj -o seurat_obj

# Folder holding one saved hdWGCNA object per cell type
dat_dir <- "../results/hdWGCNA/SeuratObject/Consensus/"

if (!dir.exists(dat_dir)) dir.create(dat_dir, recursive=TRUE)

for (cell_type in names(seurat_obj)){

    print(paste0('Saving hdWGCNA object in hdWGCNA Experiment for ', toupper(cell_type)))

    # Written as <cell_type>_hdWGCNA_object.rds -- the same file name pattern and
    # folder the loading cell of this notebook reads from.
    saveRDS(
        seurat_obj[[cell_type]],
        file = paste0(dat_dir, cell_type, '_hdWGCNA_object.rds')
    )

}

[1] "Saving hdWGCNA object in hdWGCNA Experiment for ASTROCYTE"
