In [1]:
import rpy2
import scipy
import logging
import warnings
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import decoupler as dc
import scrublet as scr
import decoupler as dc
from scipy import sparse
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from typing import Optional, Union
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

In [2]:
def get_sys_dpi(width, height, diag):
    '''
    Estimate the pixel density (dots per inch) of a display.

    width: horizontal resolution in pixels (if unsure, visit `whatismyscreenresolution.net`)
    height: vertical resolution in pixels
    diag: screen diagonal in inches

    Returns the horizontal pixel count divided by the physical screen
    width in inches, rounded with `round`.
    '''
    # The physical width follows from the diagonal and the aspect ratio:
    # diag**2 = w**2 + h**2 with h / w = height / width.
    aspect_sq = height**2 / width**2
    width_inches = (diag**2 / (1 + aspect_sq)) ** 0.5
    return round(width / width_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
anndata2ri                  1.1
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
cloudpickle                 2.2.0
colorama                    0.4.4
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
decoupler                   1.4.0
defusedxml                  0.7.1
de

# **hdWGCNA Module Preservation**

In [4]:
%%R
# Attach every R package used in this notebook. Start-up messages are
# suppressed to keep the cell output short. The attach order is left
# untouched because it determines which package masks which function.
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(conflicted)
    library(readxl)


    # single-cell analysis package
    library(Seurat)
    library(monocle3)
    library(SeuratWrappers)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages:
    library(WGCNA)   # already attached at the top of this block, so this call changes nothing
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    library(GSEABase)
    library(GSVA) 
# These library() calls need to be re-run every time a new R session is started (e.g. to use %>%)
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
# Study whose metadata and saved hdWGCNA objects are used in this notebook,
# and the external studies whose raw AnnData files are read in as query data.
save_prefix = 'leng_etc'
query_studies = ['grubman_etc', 'mathys_pfc', 'allen_mca']

# Name of the subject-identifier column for each supported `save_prefix`
# (for leng this is `PatientID`, for mathys it is 'Subject', and for allen 'individualID').
subject_ids_for_study = {
    'allen_mtg': 'individualID',
    'leng_sfg': 'PatientID',
    'leng_etc': 'PatientID',
}
subject_id = subject_ids_for_study[save_prefix]

# Differential-expression settings.
map_meta = True
deg_method = 'DESeq2-Wald'
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
filter_genes = "TRUE"    # stored as a string rather than a Python bool

gene_celltype_threshold = 0.10    # determines number of cells the gene must be expressed in
covariates = ['None']             # list of covariates to be accounted for in regression.

# NOTE(review): the entry below carries literal single quotes inside the string, so the
# quotes become part of the file name built from it in the R loading cell -- confirm
# that this is intended.
celltypes = ["'Excitatory'"]  #["Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC"]

# Gene selection method used when setting up the Seurat object for WGCNA.
# Possible values are "custom", "fraction", "variable".
# If custom, a list of genes must be passed.
gene_selection = 'custom'

# Only relevant when gene_selection = 'custom': specifies how to obtain the list of
# genes to pass into `SetupForWGCNA`. The possible values are "diff_exp", "overlap", "all".
gene_set_select = 'diff_exp'

# Metadata table of the study named by `save_prefix`.
metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv'
meta = pd.read_csv(metadata, encoding_errors='ignore')



# **Reading in the Query data**

In [6]:
# Read the raw AnnData file of every query study into a dict keyed by study name.
adata_query = {}
rule = '----------------------------'

for key in query_studies:
    # Banner announcing which study is being read
    print(rule, f'Fetching {key.upper()} data', rule, sep='\n')

    # Each study keeps its raw data under ../data/raw/<study>/
    h5ad_path = f'../data/raw/{key}/{key}_raw_anndata.h5ad'
    adata_query[key] = sc.read_h5ad(h5ad_path)
    print(adata_query[key])

----------------------------
Fetching GRUBMAN_ETC data
----------------------------
AnnData object with n_obs × n_vars = 13214 × 10850
    obs: 'sampleID', 'batch', 'patient', 'sex', 'nGene', 'nUMI', 'pctMT', 'cellType', 'batchCond', 'subclustCond', 'subIDm', 'subIDa', 'subIDn', 'subIDo', 'subIDO', 'subIDe', 'subIDu', 'subIDh', 'mg', 'astro', 'neuron', 'oligo', 'OPC', 'endo', 'UMAP1_ALL', 'UMAP2_ALL', 'PC1_ALL', 'PC2_ALL', 'PC3_ALL', 'subclustID', 'UMAP1_ct', 'UMAP2_ct', 'PC1_ct', 'PC2_ct', 'PC3_ct'
----------------------------
Fetching MATHYS_PFC data
----------------------------
AnnData object with n_obs × n_vars = 70634 × 17926
    obs: 'projid', 'tsne1', 'tsne2', 'pre.cluster', 'broad.cell.type', 'Subcluster', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp'

In [11]:
# Prepare the query data for the R instance: re-pack the objects held in
# `adata_query` (the AnnData objects read above) as an rpy2 ListVector keyed
# by study name. Note that this rebinds `adata_query` to the ListVector.
adata_query = robjects.ListVector(
    {name: adata_query[name] for name in adata_query.keys()}
)

## **Load Reference Seurat object for Module Preservations**

In [13]:
%%R -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i save_prefix -i gene_selection -i gene_set_select -o seurat_obj

# Load the saved hdWGCNA object of every requested cell type into the named
# list `seurat_obj`. Cell types whose object cannot be loaded are reported and
# left out of the list.

# Report a failed load and drop `cell_type` from the list of loaded objects.
#   cell_type  : name of the cell type whose load failed
#   seurat_obj : named list of the objects loaded so far
#   reason     : optional text describing why the load failed
# The filtered list is written back to `seurat_obj` in the global environment.
handle_loading_failure <- function(cell_type, seurat_obj, reason = NULL) {
    cat(paste0('Could not load data for ', toupper(cell_type), '\n'))
    if (!is.null(reason)) {
        cat(paste0('Reason: ', reason, '\n'))
    }
    cat(paste0(toupper(cell_type), ' dropped from experiment\n'))

    # Remove the failed cell_type from the list
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]

    # Update the modified seurat_obj in the global environment
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv)
}

seurat_obj <- list()

for (cell_type in celltypes) {

    cat(paste0('Loading data for hdWGCNA Experiment in ', toupper(cell_type), '\n'))

    # Only errors count as a failed load. Informational messages are no longer
    # caught: a `message` handler in tryCatch() aborts the expression, so a
    # harmless message would have dropped a cell type that loaded fine.
    tryCatch({
        ref_obj <- readRDS(paste0("../results/hdWGCNA/SeuratObject/", save_prefix, '/', cell_type, '_hdWGCNA_object.rds'))

        # as.character() protects against `pathology.group` being a factor:
        # ifelse() would otherwise return the factor's integer codes for the
        # 'no' branch instead of its labels.
        pathology_group <- as.character(ref_obj@meta.data$pathology.group)
        ref_obj@meta.data$pathology.group2 <- ifelse(pathology_group != 'no', 'AD', pathology_group)

        seurat_obj[[cell_type]] <- ref_obj
        print(seurat_obj[[cell_type]])
        print('............')

    }, error = function(e){
        # Drop the cell type and surface the underlying error text
        handle_loading_failure(cell_type, seurat_obj, conditionMessage(e))
    })
}

cat("Loaded available data\n")



[1] "Loading data for hdWGCNA Experiment in 'EXCITATORY'"
[1] "loaded data"


In [14]:
# Re-pack the per-cell-type objects held in `seurat_obj` as an rpy2 ListVector
# keyed by cell type, so the collection can be passed to the next R cell via `-i`.
seurat_obj = robjects.ListVector(
    {name: seurat_obj[name] for name in seurat_obj.keys()}
)

## **Projecting modules from reference to query**

Here, we check whether the co-expression modules detected in one (`reference`) dataset are preserved in external (`query`) datasets. Rather than building a new co-expression network from scratch in a new dataset, we can take the modules from the reference and project them into the query dataset. 

For each cell type, we project the modules from the reference dataset into each of the query datasets listed above. 


We assume that a basic single-cell pipeline has already been run on the query dataset (normalization, scaling, variable features, PCA, batch correction, UMAP, clustering). First we make a UMAP plot to visualize the two datasets to ensure they have both been processed.


In [8]:
%%R -i seurat_obj -o seurat_obj -i ref_levels

# ref_levels[['Braak']] <- '0.0'

# For every cell type: coerce the trait columns of the metadata to the expected
# type, then run ModuleTraitCorrelation on the object.
# `cur_factors` and `cur_numerics` are not defined in this cell; they must
# already exist in the R session.
for (cell_type in names(seurat_obj)) {
  print(paste0('Estimating Module Trait Correlations in ', toupper(cell_type)))

  # Edit a copy of the metadata table and write it back once all columns are coerced
  meta_df <- seurat_obj[[cell_type]]@meta.data

  # Categorical traits become factors; the reference level is set when one is supplied
  for (factor_col in cur_factors) {
    meta_df[[factor_col]] <- as.factor(meta_df[[factor_col]])

    if (factor_col %in% names(ref_levels)) {
      meta_df[[factor_col]] <- relevel(meta_df[[factor_col]], ref = ref_levels[[factor_col]])
    }
  }

  # Continuous traits become numeric
  for (numeric_col in cur_numerics) {
    meta_df[[numeric_col]] <- as.numeric(meta_df[[numeric_col]])
  }

  seurat_obj[[cell_type]]@meta.data <- meta_df

  # list of traits to correlate
  cur_traits <- c(cur_factors, cur_numerics)

  seurat_obj[[cell_type]] <- ModuleTraitCorrelation(
    seurat_obj[[cell_type]],
    traits = cur_traits,
    group.by = 'cell_type'
  )
}

[1] "Estimating Module Trait Correlations in OPC"


1: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait Braak is a factor with levels 0, 2, 4, 5, 6. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
2: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait apoe4Status is a factor with levels N, Y. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
3: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait Cognitive.status is a factor with levels No dementia, Dementia. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
4: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait sex_y is a factor with levels female, male. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?


In [9]:
# Re-pack the per-cell-type objects held in `seurat_obj` as an rpy2 ListVector
# keyed by cell type, so the collection can be passed to the next R cell via `-i`.
seurat_obj = robjects.ListVector(
    {name: seurat_obj[name] for name in seurat_obj.keys()}
)

## **Inspecting the output**

We can run the function `GetModuleTraitCorrelation` to retrieve the output of `ModuleTraitCorrelation`.

In [10]:
%%R -i seurat_obj -o seurat_obj -o mt_cor

# Collect the module-trait correlation results of every cell type in a named list.
mt_cor <- list()

for (cell_type in names(seurat_obj)) {
  current_obj <- seurat_obj[[cell_type]]

  # get the mt-correlation results
  mt_cor[[cell_type]] <- GetModuleTraitCorrelation(current_obj)
}

In [11]:
# Re-pack the per-cell-type objects held in `seurat_obj` as an rpy2 ListVector
# keyed by cell type, so the collection can be passed to the next R cell via `-i`.
seurat_obj = robjects.ListVector(
    {name: seurat_obj[name] for name in seurat_obj.keys()}
)

## **Plot Correlation Heatmaps**

We can plot the results of our correlation analysis using the `PlotModuleTraitCorrelation` function. This function creates a separate heatmap for each of the correlation matrices, and then assembles them into one plot using patchwork.

In [12]:
%%R -i seurat_obj -o seurat_obj

# Write one PDF of module-trait correlation heatmaps per cell type.
# `save_prefix` is not passed with -i here; it must already exist in the R session.

conflicts_prefer(dplyr::select)

fig_dir <- paste0("../results/hdWGCNA/ModuleCorrelations/", save_prefix, '/')
if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){

  print(paste0('Plotting module Trait Correlations for ', toupper(cell_type)))

  pdf(paste0(fig_dir, cell_type, '_trait_correlations.pdf'), width=5, height=4)

  # `finally` closes the PDF device even when plotting fails, so an error does
  # not leave an open device behind that captures later plots.
  tryCatch({
    p <- PlotModuleTraitCorrelation(
        seurat_obj[[cell_type]],
        label = 'fdr',
        label_symbol = 'stars',
        text_size = 2,
        text_digits = 2,
        text_color = 'white',
        high_color = 'yellow',
        mid_color = 'black',
        low_color = 'purple',
        plot_max = 0.2,
        combine=TRUE
      )

    print(p)
  }, finally = {
    invisible(dev.off())
  })

}

[conflicted] Will prefer dplyr::select over any other package.
[1] "Plotting module Trait Correlations for OPC"
[1] "all_cells"
[1] "Braak"            "apoe4Status"      "Cognitive.status" "sex_y"           
[5] "pmi"             
[1] "OPC"
[1] "Braak"            "apoe4Status"      "Cognitive.status" "sex_y"           
[5] "pmi"             


In [13]:
# Re-pack the per-cell-type objects held in `seurat_obj` as an rpy2 ListVector
# keyed by cell type, so the collection can be passed to the next R cell via `-i`.
seurat_obj = robjects.ListVector(
    {name: seurat_obj[name] for name in seurat_obj.keys()}
)

# **Save Seurat Object**

In [14]:
%%R -i seurat_obj -o seurat_obj

# Write the object of every cell type to an .rds file under the results
# directory of the current `save_prefix`, creating the directory if needed.
dat_dir <- paste0("../results/hdWGCNA/SeuratObject/", save_prefix, '/')

if (!dir.exists(dat_dir)) {
  dir.create(dat_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)) {

    print(paste0('Saving hdWGCNA object in hdWGCNA Experiment for ', toupper(cell_type)))

    # One file per cell type: <dat_dir><cell_type>_hdWGCNA_object.rds
    rds_path <- paste0(dat_dir, cell_type, '_hdWGCNA_object.rds')
    saveRDS(seurat_obj[[cell_type]], file=rds_path)

}

[1] "Saving hdWGCNA object in hdWGCNA Experiment for OPC"
