In [1]:
import rpy2
import scipy
import logging
import warnings
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import decoupler as dc
import scrublet as scr
import decoupler as dc
from scipy import sparse
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from typing import Optional, Union
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

In [2]:
def get_sys_dpi(width, height, diag):
    """
    Estimate the pixel density (dots per inch) of a display.

    width:  horizontal resolution in pixels
            (if unsure, visit `whatismyscreenresolution.net`)
    height: vertical resolution in pixels
    diag:   screen diagonal in inches

    Returns the DPI rounded to the nearest integer.
    """
    # diag**2 = w_in**2 + h_in**2 and h_in / w_in = height / width,
    # so the physical width follows from the squared aspect ratio.
    aspect_sq = height**2 / width**2
    physical_width = (diag**2 / (1 + aspect_sq)) ** 0.5
    return round(width / physical_width)

In [3]:
# Notebook-wide setup: logging/warning filters, rpy2 conversion, figure DPI.

# Only let ERROR-level records through rpy2's R callback logger.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Hide these Python warning categories for the rest of the session.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
# (anndata2ri presumably does the same for AnnData objects -- TODO confirm),
# then load the rpy2 IPython extension that provides the `%%R` cell magic
# used by the R cells further down.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

# Figure DPI derived from the display; the arguments are width (px),
# height (px) and diagonal (inches) -- adjust them for your own screen.
rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print the versions of the loaded packages for provenance.
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
absl                        NA
anndata2ri                  1.1
appnope                     0.1.3
asttokens                   NA
attr                        23.1.0
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
deprecate                   0.3.2
dot_parser                  NA
executing                   1.2.0
fsspec                      2023.6.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib        

# **hdWGCNA Module Trait Correlation**

In [4]:
%%R
# Attach every R package used by the R cells of this notebook, with the
# package start-up messages suppressed.
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(conflicted)
    library(readxl)


    # single-cell analysis package
    library(Seurat)
    library(monocle3)
    library(SeuratWrappers)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages
    # NOTE(review): WGCNA is already attached at the top of this block, so
    # the call below is redundant.
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    library(GSEABase)
    library(GSVA) 
# NOTE(review): the original note here said this block must be re-run in
# every new R session to be able to use `%>%` -- confirm which of the
# attached packages is expected to provide it.
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

1: package ‘IRanges’ was built under R version 4.3.1 
2: package ‘GenomeInfoDb’ was built under R version 4.3.1 
3: package ‘GSVA’ was built under R version 4.3.1 


## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration. Everything a reader might tune lives in this cell;
# the `%%R` cells below import these names with `-i`.
# ---------------------------------------------------------------------------

save_prefix = 'seaad_mtg'               # study key; selects the entries of every per-study dict below
map_meta = True
deg_method = 'DESeq2-Wald'
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
filter_genes = "TRUE"

# Name of the donor/subject identifier column for each study.
subject_ids_for_study = {'allen_mtg': 'individualID',
                         'leng_sfg': 'PatientID',
                         'leng_etc': 'PatientID',
                         'seaad_mtg': 'individualID'}
subject_id = subject_ids_for_study[save_prefix]

gene_celltype_threshold = 0.10          # determines number of cells the gene must be expressed in
covariates = ['None']                   # list of covariates to be accounted for in regression.
celltypes = ["OPC"]  #["Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC"]

# Donor-level metadata table for the selected study.
metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv'
meta = pd.read_csv(metadata, encoding_errors='ignore')

gene_selection = 'custom'   # specifies the gene selection method when setting up the Seurat object for WGCNA.
                            # Possible values are "custom", "fraction", "variable".
                            # If custom, a list of genes must be passed.

gene_set_select = 'diff_exp'      # Only used when gene_selection = 'custom'. Specifies how to obtain the list of
                                  # genes to pass into `SetupForWGCNA`. Possible values are "diff_exp", "overlap", "all".

# Categorical traits to correlate with module eigengenes, per study.
correlate_factors = {'allen_mtg': ['Braak', 'Cognitive.status', 'CERAD', 'ADNC', 'Thal.phase'],
                     'leng_etc': ['BraakStage', 'pathology.group2', 'CERAD', 'ADNC', 'Thal Phase'],
                     'leng_sfg': ['BraakStage', 'pathology.group2', 'CERAD', 'ADNC', 'Thal Phase'],
                     'seaad_mtg': ['Braak', 'Cognitive Status', 'CERAD score', 'ADNC', 'Thal']}

cur_factors = correlate_factors[save_prefix]

# Numeric traits to correlate with module eigengenes, per study.
correlate_numerics = {'allen_mtg': ['pmi'],
                      'leng_etc': ['pmi'],
                      'leng_sfg': ['pmi'],
                      'seaad_mtg': ['PMI']}

cur_numerics = correlate_numerics[save_prefix]

# Ordered levels of every categorical trait, per study. The order matters:
# the first entry is the reference level and the factor is converted to
# numeric in this order for the correlation.
covariate_levels = {'allen_mtg': {'Braak': ['0', '2', '4', '5', '6'],
                                  'Cognitive.status': ['No dementia', 'Dementia'],
                                  'CERAD': ['0', '2', '3'],
                                  'ADNC': ['Not AD', 'Intermediate', 'High'],
                                  # fixed: these previously read 'That 3/4/5', which can never match a
                                  # 'Thal ...' label and would turn those stages into NA
                                  'Thal.phase': ['Thal 0', 'Thal 3', 'Thal 4', 'Thal 5']},

                    'seaad_mtg': {'Braak': ['Braak 0', 'Braak II', 'Braak IV', 'Braak V', 'Braak VI'],
                                  'Cognitive Status': ['No dementia', 'Dementia'],
                                  'CERAD score': ['Absent', 'Moderate', 'Frequent'],
                                  'ADNC': ['Not AD', 'Intermediate', 'High'],
                                  'Thal': ['Thal 0', 'Thal 3', 'Thal 4', 'Thal 5']},

                    'leng_etc': {'BraakStage': ['0', '2', '6'],
                                 'pathology.group2': ['no', 'AD'],
                                 'ADNC': ['Not AD', 'Low', 'High'],
                                 'Thal Phase': ['Thal 0', 'Thal 1/2', 'Thal 3', 'Thal 4/5'],
                                 'CERAD': ['0', '1', '2', '3']},

                    'leng_sfg': {'BraakStage': ['0', '2', '6'],
                                 'pathology.group2': ['no', 'AD'],
                                 'ADNC': ['Not AD', 'Low', 'High'],
                                 'Thal Phase': ['Thal 0', 'Thal 1/2', 'Thal 3', 'Thal 4/5'],
                                 'CERAD': ['0', '1', '2', '3']},
                    }

# Convert the {trait: [levels]} dict for the selected study into a named R
# list. Each value is wrapped explicitly as an R character vector so that
# `ref_levels[[trait]]` can be handed straight to `factor(levels = ...)`.
ref_levels = robjects.ListVector(
    {
        factor: robjects.StrVector(levels)
        for factor, levels in covariate_levels[save_prefix].items()
    }
)

## **Load Seurat object for Network Visualizations**

In [None]:
%%R -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i save_prefix -i gene_selection -i gene_set_select -o seurat_obj

# Load the saved hdWGCNA Seurat object of every requested cell type into the
# named list `seurat_obj`. Cell types whose object cannot be loaded are
# reported and dropped; the others are kept.

# Report a failed load and make sure `cell_type` is not part of the list.
#   cell_type  : name of the cell type that failed
#   seurat_obj : current named list of loaded objects
#   reason     : optional error message to show to the user
handle_loading_failure <- function(cell_type, seurat_obj, reason = NULL) {
    cat(paste0('Could not load data for ', toupper(cell_type), '\n'))
    if (!is.null(reason)) {
        cat(paste0('  reason: ', reason, '\n'))
    }
    cat(paste0(toupper(cell_type), ' dropped from experiment\n'))

    # Remove the failed cell_type from the list
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]

    # Update the modified seurat_obj in the global environment
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv)
}

seurat_obj <- list()

for (cell_type in celltypes) {

    cat(paste0('Loading data for hdWGCNA Experiment in ', toupper(cell_type), '\n'))

    rds_path <- paste0("../results/hdWGCNA/SeuratObject/", save_prefix, '/', cell_type, '_hdWGCNA_object.rds')

    # Only genuine errors count as a failed load. Informational messages are
    # no longer caught: catching them aborted the load and dropped the cell
    # type even though nothing had gone wrong.
    loaded <- tryCatch({
        obj <- readRDS(rds_path)

        # Derive the binary no/AD grouping only when the source column exists.
        # Compare on character values: passing a factor as the `no` branch of
        # ifelse() would return its integer codes instead of its labels.
        if ('pathology.group' %in% colnames(obj@meta.data)) {
            grp <- as.character(obj@meta.data$pathology.group)
            obj@meta.data$pathology.group2 <- ifelse(grp != 'no', 'AD', grp)
        }
        obj
    }, error = function(e) {
        handle_loading_failure(cell_type, seurat_obj, conditionMessage(e))
        NULL
    })

    # Register the object only after it has been loaded successfully
    if (!is.null(loaded)) {
        seurat_obj[[cell_type]] <- loaded
        print(loaded)
        print('............')
    }
}

cat("Loaded available data\n")



[1] "Loading data for hdWGCNA Experiment in OPC"
An object of class Seurat 
36601 features across 3895 samples within 1 assay 
Active assay: originalexp (36601 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "loaded data"


In [None]:
# Re-wrap the per-cell-type Seurat objects returned by the previous `%%R`
# cell as a named rpy2 ListVector (one entry per cell type).
seurat_obj = robjects.ListVector(
    dict((cell_type, seurat_obj[cell_type]) for cell_type in seurat_obj.keys())
)

## **Compute correlations**

Here we use the function `ModuleTraitCorrelation` to correlate selected variables with module eigengenes. This function computes correlations for specified groupings of cells, since we can expect that some variables may be correlated with certain modules in certain cell groups but not in others. There are certain types of variables that can be used for this analysis while others should not be used.

**Variables that can be used**

- Numeric variables
- Categorical variables with only 2 categories, such as “control” and “condition”.
- Categorical variables with a sequential relationship. For example, you may have a “disease stage” category ordered by “healthy”, “stage 1”, “stage 2”, “stage 3”, etc. In this case, you must ensure that the variable is stored as a factor and that the levels are set appropriately.

**Variables that cannot be used**

- Categorical variables with more than two categories that are not sequentially linked. For example, suppose you have a dataset consisting of three strains of transgenic mice and one control. Categorical variables must be converted to numeric before running the correlation, so you will end up with a correlation that is not at all biologically meaningful, since there is no way to order the three different strains that makes sense as a numeric variable. In this case, you should set up a pairwise correlation between the control and each strain separately. We often have a “Sample ID” variable indicating which cell came from which sample; such a variable has no meaningful order, so it is not suitable for module-trait correlation analysis.

In [None]:
%%R -i seurat_obj -o seurat_obj -i ref_levels -i meta -i cur_factors -i cur_numerics -i subject_id

# Module-trait correlation for every cell type.
#
# For each Seurat object:
#   1. make sure every categorical trait (`cur_factors`) is present in the
#      cell metadata as a factor with the level order given in `ref_levels`;
#   2. make sure every numeric trait (`cur_numerics`) is present as numeric;
#   3. run hdWGCNA's `ModuleTraitCorrelation` on those traits.
# Traits missing from the Seurat metadata are looked up per donor in `meta`.
#
# `cur_factors`, `cur_numerics` and `subject_id` are now imported explicitly
# with `-i`, so the cell no longer depends on names left in the R session.

# Python lists may arrive as R lists; flatten them to character vectors so
# they can be used for looping and for column selection.
cur_factors  <- as.character(unlist(cur_factors))
cur_numerics <- as.character(unlist(cur_numerics))

# list of traits to correlate
cur_traits <- c(cur_factors, cur_numerics)

# Return the donor-level values of `column` (taken from `meta`) for every
# cell in `cell_meta`, matched on the subject identifier column.
map_from_donor <- function(cell_meta, column) {
  if (!(column %in% colnames(meta))) {
    stop(paste0("Trait '", column, "' was found neither in the Seurat metadata nor in `meta`"))
  }
  donor_map <- setNames(meta[[column]], as.character(meta[[subject_id]]))
  # Index by character: indexing with a factor would use its integer codes
  # rather than the donor names.
  unname(donor_map[as.character(cell_meta[[subject_id]])])
}

for (cell_type in names(seurat_obj)) {

  print(paste0('Estimating Module Trait Correlations in ', toupper(cell_type)))

  for (fact in cur_factors) {

    # Add the trait from the donor metadata if the object does not have it yet
    if (!(fact %in% colnames(seurat_obj[[cell_type]]@meta.data))) {
      seurat_obj[[cell_type]]@meta.data[[fact]] <- map_from_donor(seurat_obj[[cell_type]]@meta.data, fact)
    }

    raw_values <- as.character(seurat_obj[[cell_type]]@meta.data[[fact]])

    if (fact %in% names(ref_levels)) {
      # Impose the configured level order; the first level is the reference.
      trait_levels <- as.character(unlist(ref_levels[[fact]]))

      # Values without a matching level silently become NA -- make that visible.
      unmatched <- setdiff(unique(raw_values[!is.na(raw_values)]), trait_levels)
      if (length(unmatched) > 0) {
        warning(paste0("Trait '", fact, "' in ", cell_type, " has values not listed in ref_levels (set to NA): ",
                       paste(unmatched, collapse = ', ')))
      }

      seurat_obj[[cell_type]]@meta.data[[fact]] <- factor(raw_values, levels = trait_levels)
    } else {
      # No configured order: keep the default levels instead of passing
      # NULL levels to factor().
      seurat_obj[[cell_type]]@meta.data[[fact]] <- as.factor(raw_values)
    }
  }

  for (numer in cur_numerics) {

    if (!(numer %in% colnames(seurat_obj[[cell_type]]@meta.data))) {
      seurat_obj[[cell_type]]@meta.data[[numer]] <- map_from_donor(seurat_obj[[cell_type]]@meta.data, numer)
    }

    numer_values <- seurat_obj[[cell_type]]@meta.data[[numer]]
    # as.numeric() on a factor returns level codes, not the stored numbers
    if (is.factor(numer_values)) {
      numer_values <- as.character(numer_values)
    }
    seurat_obj[[cell_type]]@meta.data[[numer]] <- as.numeric(numer_values)
  }

  seurat_obj[[cell_type]] <- ModuleTraitCorrelation(
    seurat_obj[[cell_type]],
    traits = cur_traits
  )
}

[1] "Estimating Module Trait Correlations in OPC"


1: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait Braak is a factor with levels Braak 0, Braak II, Braak IV, Braak V, Braak VI. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
2: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait Cognitive Status is a factor with levels . Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
3: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait CERAD score is a factor with levels Absent, Moderate, Frequent. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
4: In ModuleTraitCorrelation(seurat_obj[[cell_type]], traits = cur_traits,  :
  Trait ADNC is a factor with levels Not AD, Intermediate, High. Levels will be converted to numeric IN THIS ORDER for the correlation, is this the expected order?
5: In ModuleTraitC

In [None]:
# Re-wrap the per-cell-type Seurat objects returned by the previous `%%R`
# cell as a named rpy2 ListVector (one entry per cell type).
seurat_obj = robjects.ListVector(
    dict((cell_type, seurat_obj[cell_type]) for cell_type in seurat_obj.keys())
)

## **Inspecting the output**

We can run the function `GetModuleTraitCorrelation` to retrieve the results that `ModuleTraitCorrelation` stored in each Seurat object.

In [None]:
%%R -i seurat_obj -o seurat_obj -o mt_cor

# Gather the module-trait correlation results of every cell type into the
# named list `mt_cor` (one entry per cell type).
mt_cor <- list()

for (cell_type in names(seurat_obj)) {
  current_obj <- seurat_obj[[cell_type]]
  mt_cor[[cell_type]] <- GetModuleTraitCorrelation(current_obj)
}

In [None]:
# Re-wrap the per-cell-type Seurat objects returned by the previous `%%R`
# cell as a named rpy2 ListVector (one entry per cell type).
seurat_obj = robjects.ListVector(
    dict((cell_type, seurat_obj[cell_type]) for cell_type in seurat_obj.keys())
)

## **Plot Correlation Heatmaps**

We can plot the results of our correlation analysis using the `PlotModuleTraitCorrelation` function. This function creates a separate heatmap for each of the correlation matrices, and then assembles them into one plot using patchwork.

In [None]:
%%R -i seurat_obj -o seurat_obj -i save_prefix

# Write one module-trait correlation heatmap PDF per cell type to
# ../results/hdWGCNA/ModuleCorrelations/<save_prefix>/.
# `save_prefix` is imported explicitly so the cell does not depend on a value
# left in the R session by an earlier cell.

# Resolve the dplyr/other-package `select` clash raised by `conflicted`
conflicts_prefer(dplyr::select)

fig_dir <- paste0("../results/hdWGCNA/ModuleCorrelations/", save_prefix, '/')
if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){

  print(paste0('Plotting module Trait Correlations for ', toupper(cell_type)))

  pdf(paste0(fig_dir, cell_type, '_trait_correlations.pdf'), width=5, height=4)

  # `finally` closes the PDF device even when plotting fails, so a failing
  # cell type cannot leave a device open and capture later figures.
  tryCatch({
    p <- PlotModuleTraitCorrelation(
        seurat_obj[[cell_type]],
        label = 'fdr',
        label_symbol = 'stars',
        text_size = 2,
        text_digits = 2,
        text_color = 'white',
        high_color = 'yellow',
        mid_color = 'black',
        low_color = 'purple',
        plot_max = 0.2,
        combine=TRUE
      )

    print(p)
  }, finally = invisible(dev.off()))

}

[conflicted] Will prefer dplyr::select over any other package.
[1] "Plotting module Trait Correlations for OPC"
[1] "all_cells"
[1] "Braak"            "Cognitive Status" "CERAD score"      "ADNC"            
[5] "Thal"             "PMI"             
[1] "SingleCellExperiment"
[1] "Braak"            "Cognitive Status" "CERAD score"      "ADNC"            
[5] "Thal"             "PMI"             


In [None]:
# Re-wrap the per-cell-type Seurat objects returned by the previous `%%R`
# cell as a named rpy2 ListVector (one entry per cell type).
seurat_obj = robjects.ListVector(
    dict((cell_type, seurat_obj[cell_type]) for cell_type in seurat_obj.keys())
)

# **Save Seurat Object**

In [None]:
%%R -i seurat_obj -o seurat_obj

# Write every cell type's Seurat object to
# ../results/hdWGCNA/SeuratObject/<save_prefix>/<cell_type>_hdWGCNA_object.rds.
# This is the same path pattern the objects were read from, so existing
# files are overwritten.
dat_dir <- paste0("../results/hdWGCNA/SeuratObject/", save_prefix, '/')

# Create the output folder (and any missing parents) on first use
if (!dir.exists(dat_dir)) dir.create(dat_dir, recursive = TRUE)

for (cell_type in names(seurat_obj)) {

    rds_file <- paste0(dat_dir, cell_type, '_hdWGCNA_object.rds')

    print(paste0('Saving hdWGCNA object in hdWGCNA Experiment for ', toupper(cell_type)))

    saveRDS(seurat_obj[[cell_type]], file = rds_file)
}

[1] "Saving hdWGCNA object in hdWGCNA Experiment for OPC"
