In [1]:
import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)

In [2]:
import os
import rpy2
import logging
import warnings
import anndata2ri
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import decoupler as dc
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from itertools import chain
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from functions import helper_functions


In [3]:
# Session setup: quiet rpy2 logging, enable the Python <-> R bridges and
# configure scanpy / pandas diagnostics for the rest of the notebook.

# Only let rpy2's callback logger emit messages at ERROR level or above,
# which hides R warning messages.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# Presumably registers the AnnData <-> SingleCellExperiment conversion used when
# objects cross the %%R boundary -- TODO confirm against the anndata2ri docs.
anndata2ri.activate()
# Load the rpy2 IPython extension, which provides the %%R cell magic used below.
%load_ext rpy2.ipython

# Scanpy logging verbosity (3 is believed to be the "hint" level -- verify in the scanpy docs).
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print package versions so the run environment is recorded in the notebook output.
sc.logging.print_versions()

# Suppress pandas PerformanceWarning for the rest of the session.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

-----
anndata     0.11.1
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
dot_parser                  NA
exceptiongroup              1.1.1
executing                   1.2.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain      

In [4]:
%%R

# Attach every R package used by the notebook in one place, then set the
# plotting theme, the random seed and the WGCNA thread count.
# NOTE(review): the attach order is significant -- when two packages export the
# same name, the one attached later masks the earlier one. Do not reorder
# without checking for masked functions.
suppressPackageStartupMessages({
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)
    library(conflicted)
    library(dplyr)
    library(parallel)
    library(stringr)
    library(tibble)
    library(BiocParallel)

    # single-cell analysis package
    library(Seurat)
    library(zellkonverter)   
    library(SingleCellExperiment)
    library(tidyr)
    # NOTE(review): readxl is already attached above; this second call is redundant.
    library(readxl)
    library(GSA)
    library(limma)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)
    library(ggplot2)

    # co-expression network analysis packages:
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)
    library(GSEABase)
    library(GSVA) 

    # cell-cell communication
    library(nichenetr)

# NOTE: this block needs to be run every time a fresh R session is started
# (for example, before the `%>%` pipe can be used).
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# NOTE(review): the thread count is hard-coded to 40 -- confirm that it suits
# the number of cores on the machine running the notebook.
enableWGCNAThreads(nThreads = 40)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    Allowing parallel execution with up to 40 working processes.


1: replacing previous import ‘GenomicRanges::intersect’ by ‘SeuratObject::intersect’ when loading ‘hdWGCNA’ 
2: replacing previous import ‘GenomicRanges::union’ by ‘dplyr::union’ when loading ‘hdWGCNA’ 
3: replacing previous import ‘GenomicRanges::setdiff’ by ‘dplyr::setdiff’ when loading ‘hdWGCNA’ 
4: replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘hdWGCNA’ 
5: replacing previous import ‘Seurat::components’ by ‘igraph::components’ when loading ‘hdWGCNA’ 
6: replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘hdWGCNA’ 
7: replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘hdWGCNA’ 
8: replacing previous import ‘GenomicRanges::subtract’ by ‘magrittr::subtract’ when loading ‘hdWGCNA’ 
9: replacing previous import ‘Matrix::as.matrix’ by ‘proxy::as.matrix’ when loading ‘hdWGCNA’ 
10: replacing previous import ‘igraph::groups’ by ‘tidygraph::groups’ when loading ‘hdWGCNA’ 


## **Data Prep Parameters**

- `test_names`: List of the different test names of interest.

- `save_prefix`: Preferred prefix for saving critical files, ideally in the format `{source name}_{brain region}`, e.g. `mathys_pfc`.

- `subject_id`: Column name for Subject/Patient ID in both metadata and `.obs`

In [5]:
# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------
save_prefix = 'seaad_mtg'                                       # format: '{StudyName}_{ThreeLetterAcronymForBrainRegion}'
subject_id = helper_functions.clean_strings('Donor ID')         # subject/patient ID column, passed through the project's string cleaner
cell_type_column = 'Subclass'                                   # alternative: 'Supertype (non-expanded)'

# Covariate of interest (an alternative noted by the authors: 'pathology.group').
# The raw column label is normalised with the project's string cleaner in one step.
factor = helper_functions.clean_strings('Continuous Pseudo-progression Score', replace_hyphen=True)

test_names = ['late_vs_early']                                  # test categories
region_name = save_prefix.split('_')[-1].upper()                # last token of the prefix, upper-cased ('seaad_mtg' -> 'MTG')
count_agg_strategy = 'metacell'   # options: 'network', 'random', 'pseudobulk', 'smaller_network', 'standardbulk', 'metacell', 'blanchardbulk'

# NOTE(review): machine-specific absolute path. Keep the trailing slash; later
# cells build file paths from this prefix.
data_dir = f'/media/tadeoye/Volume1/data/seq/SEA-AD/{region_name}/RNAseq/'
save_dir = '../results/'

# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays safely re-runnable.
os.makedirs(save_dir, exist_ok=True)

# Broad cell class -> member labels (values of `cell_type_column`).
subclass = {
    'excitatory': ['L5 IT', 'L2/3 IT', 'L4 IT', 'L6 IT', 'L6 IT Car3', 'L5/6 NP', 'L6b', 'L6 CT', 'L5 ET'],
    'inhibitory': ['Pvalb', 'Sst', 'Lamp5 Lhx6', 'Vip', 'Lamp5', 'Sncg', 'Chandelier', 'Sst Chodl', 'Pax6'],
    'astrocyte': ['Astrocyte'],
    'microglia': ['Microglia-PVM'],
    'opc': ['OPC'],
    'oligodendrocyte': ['Oligodendrocyte'],
    'endothelial': ['Endothelial'],
    'vlmc': ['VLMC'],
    }

# Flat list of every label, in the insertion order of `subclass`.
cell_supertypes = list(chain.from_iterable(subclass.values()))
# R list version of the mapping, handed to the %%R cells via `-i Subclass`.
Subclass = robjects.ListVector(subclass)


# **Load and Merge Pseudo-bulked Data**

We have previously standardized and pseudo-bulked the data for each cell supertype (`00_Standardize_and_Split_RNAseq.ipynb`, `01_pseuobulking.ipynb`). First, we load the data for each cell supertype and merge them into a single seurat object.

In [6]:
# Sanitised copies of the cell-type labels (original letter case preserved),
# in both the grouped (R list) and the flat (Python list) form.
_clean_label = lambda label: helper_functions.clean_strings(label, preserve_case=True)

clean_subclass = robjects.ListVector(
    {group: list(map(_clean_label, members)) for group, members in subclass.items()}
)
clean_cell_supertypes = list(map(_clean_label, cell_supertypes))

In [7]:
%%R -i count_agg_strategy -i data_dir -i subject_id -i Subclass -i cell_supertypes -i clean_cell_supertypes -i clean_subclass -o summed_counts_per_celltype

# Load the aggregated count data and the expressed-gene list for every cell
# type. A cell type is recorded only when BOTH files load, so the two lists
# always describe the same set of cell types; failures are reported as warnings.

source('../scripts/functions/helper_functions.r')

summed_counts_per_celltype <- list()
expressed_genes_per_celltype <- list()
subclass <- Subclass

# Path of one per-cell-type artefact:
#   <data_dir>/counts/<parent class>/<clean label>/<clean label>_<strategy>_<suffix>
celltype_rds_path <- function(label, suffix) {
  clean_label <- clean_strings(label, preserve_case = TRUE)
  file.path(
    data_dir,
    "counts",
    find_parent_key(label, subclass),
    clean_label,
    paste0(clean_label, "_", count_agg_strategy, "_", suffix)
  )
}

for (idx in seq_along(cell_supertypes)) {
  cell_label <- cell_supertypes[idx]

  tryCatch({
    # Read both files before touching the result lists, so that a failure on
    # the second read cannot leave a half-registered cell type behind.
    celltype_counts <- readRDS(celltype_rds_path(cell_label, "count_data.rds"))
    celltype_expressed_genes <- readRDS(celltype_rds_path(cell_label, "expressed_genes.rds"))

    summed_counts_per_celltype[[cell_label]] <- celltype_counts
    expressed_genes_per_celltype[[cell_label]] <- celltype_expressed_genes
  },
  error = function(e) {
    # Best effort: skip this cell type, but surface the underlying error so a
    # genuine failure is not mistaken for an intentionally absent file.
    warning(
      sprintf(
        "PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR '%s'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING %s. REASON: %s",
        cell_label, cell_label, conditionMessage(e)
      ),
      call. = FALSE
    )
  })
}

1: In gzfile(file, "rb") :
  cannot open compressed file '/media/tadeoye/Volume1/data/seq/SEA-AD/MTG/RNAseq//counts/excitatory/L5_ET/L5_ET_metacell_count_data.rds', probable reason 'No such file or directory'
2: In value[[3L]](cond) :
  PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR 'L5 ET'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING L5 ET
3: In gzfile(file, "rb") :
  cannot open compressed file '/media/tadeoye/Volume1/data/seq/SEA-AD/MTG/RNAseq//counts/inhibitory/Sst_Chodl/Sst_Chodl_metacell_count_data.rds', probable reason 'No such file or directory'
4: In value[[3L]](cond) :
  PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR 'Sst Chodl'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING Sst Chodl
5: In gzfile(file, "rb") :
  cannot open compressed file '/media/tadeoye/Volume1/data/seq/SEA-AD/MTG/RNAseq//counts/microglia/Microglia_PVM/Microglia_PVM_metacell_count_data.rds', probable reason 'No such file or directory'
6: In value[[3L]](cond) :
  PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR

In [8]:
# Load the annotated nuclei-level AnnData and make its 'UMIs' layer the main matrix.
annot_h5ad_path = os.path.join(
    data_dir,
    f'counts/anndata/{save_prefix.upper()}_RNAseq_final-nuclei.2024-02-13.h5ad',
)
adata_annot = sc.read_h5ad(annot_h5ad_path)
adata_annot.X = adata_annot.layers['UMIs']

In [9]:
# Technical covariates to derive for every aggregated object:
# (covariate name passed to the helper, aggregation method, target .obs column).
pseudo_covariate_specs = (
    ('Genes detected', 'count', 'pseudo_genes_detected'),
    ('Number of UMIs', 'sum', 'pseudo_number_of_umis'),
)

for cell_type in summed_counts_per_celltype.keys():
    aggregated_adata = summed_counts_per_celltype[cell_type]

    # Compute every aggregate before writing anything back to .obs.
    aggregated_values = {
        obs_column: helper_functions.update_pseudo_numeric_covariates(
            aggregated_adata,
            adata_annot=adata_annot,
            covariate=covariate,
            agg_method=agg_method,
        )
        for covariate, agg_method, obs_column in pseudo_covariate_specs
    }

    # Min-max scale each covariate before storing it.
    # NOTE(review): a constant covariate (max == min) divides by zero here -- confirm
    # this cannot happen for the aggregation strategies in use.
    for obs_column, values in aggregated_values.items():
        lowest, highest = values.min(), values.max()
        aggregated_adata.obs[obs_column] = (values - lowest) / (highest - lowest)

In [None]:
# Merge the per-cell-type objects into a single AnnData (outer join on variables).
adata_merged = ad.concat([summed_counts_per_celltype[key] for key in summed_counts_per_celltype.keys()], join='outer')

# Densify X only when it is a sparse matrix. An explicit check replaces the
# former bare `except: pass`, which also hid unrelated failures.
if hasattr(adata_merged.X, 'toarray'):
    adata_merged.X = adata_merged.X.toarray()

# .obs columns that ended up with dtype 'object' after the merge.
mixed_columns = [col for col in adata_merged.obs.columns if adata_merged.obs[col].dtype == 'object']

# Coerce each of them to numeric where every value parses, otherwise to str,
# so that no column is left with mixed types when the object is written.
for col in mixed_columns:
    try:
        adata_merged.obs[col] = pd.to_numeric(adata_merged.obs[col])
    except (ValueError, TypeError):
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

adata_merged.obs[cell_type_column] = adata_merged.obs[cell_type_column].astype('category')

# Persist the merged object alongside the other AnnData files under data_dir.
adata_merged.write_h5ad(
    os.path.join(data_dir, f'counts/anndata/all_subclass_{count_agg_strategy}_anndata.h5ad'),
    compression='gzip',
)



In [12]:
# Display the merged AnnData summary (dimensions, .obs columns, layers) as the cell output.
adata_merged

AnnData object with n_obs × n_vars = 111258 × 36601
    obs: 'donor_id', 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'cells_merged', 'Supertype', 'ident', 'method', 'Subclass', 'Supertype..non.expanded.', 'continuous_pseudo_progression_score', 'age_at_death_binned_codes', 'sex', 'race_choice_white', 'genes_detected', 'number_of_umis', 'pmi', 'apoe4_status', 'batch', 'pseudo_genes_detected', 'pseudo_number_of_umis'
    layers: 'logcounts'