In [1]:
import os
import rpy2
import logging
import warnings
import anndata2ri
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import decoupler as dc
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from itertools import chain
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from functions import helper_functions

In [2]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
executing                   1.2.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain                     0.8.0
markupsafe                  2.1.3
matplotlib 

In [3]:
%%R

# Attach every R package used by the R cells of this notebook.
# Start-up messages are suppressed to keep the cell output short.
# The attach order is kept as-is because it determines which package masks which.
suppressPackageStartupMessages({
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)
    library(conflicted)
    library(dplyr)
    library(parallel)
    library(stringr)
    library(BiocParallel)

    # single-cell analysis packages
    library(Seurat)
    library(zellkonverter)   
    library(SingleCellExperiment)
    library(tidyr)
    # NOTE(review): readxl is already attached a few lines above; this repeat looks redundant.
    library(readxl)
    library(GSA)
    library(limma)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)
    library(ggplot2)

    # co-expression network analysis packages:
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)
    library(GSEABase)
    library(GSVA) 

    # cell-cell communication
    library(nichenetr)

# NOTE(review): orphaned remark kept from the original: "needs to be run every time
# you start R and want to use %>%" -- presumably refers to the dplyr/tidyverse loads above.
})

# use the cowplot theme for all ggplot figures
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# enable WGCNA multithreading with a fixed count of 40 threads
enableWGCNAThreads(nThreads = 40)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    Allowing parallel execution with up to 40 working processes.


1: replacing previous import ‘GenomicRanges::intersect’ by ‘SeuratObject::intersect’ when loading ‘hdWGCNA’ 
2: replacing previous import ‘GenomicRanges::union’ by ‘dplyr::union’ when loading ‘hdWGCNA’ 
3: replacing previous import ‘GenomicRanges::setdiff’ by ‘dplyr::setdiff’ when loading ‘hdWGCNA’ 
4: replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘hdWGCNA’ 
5: replacing previous import ‘Seurat::components’ by ‘igraph::components’ when loading ‘hdWGCNA’ 
6: replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘hdWGCNA’ 
7: replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘hdWGCNA’ 
8: replacing previous import ‘GenomicRanges::subtract’ by ‘magrittr::subtract’ when loading ‘hdWGCNA’ 
9: replacing previous import ‘Matrix::as.matrix’ by ‘proxy::as.matrix’ when loading ‘hdWGCNA’ 
10: replacing previous import ‘igraph::groups’ by ‘tidygraph::groups’ when loading ‘hdWGCNA’ 


# **Systematic differential analysis of pathway activity**

Pathway activity scores were computed in accordance with the overall protocol outlined in [**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w).

- Aggregate gene expression profiles into `pseudo-cells` or `pseudo-bulks` depending on the desired strategy in **[Gazestani. et. al. 2023](https://www.sciencedirect.com/science/article/pii/S0092867423008590?via%3Dihub)** (`pseudo-cells`), [**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w) (`pseudo-bulks`), or [**Morabito et al.**](https://www.sciencedirect.com/science/article/pii/S2667237523001273?via%3Dihub) (`meta-cells`)
 

- Use `PAGODA2`, `GSVA`, or `AUCell`, to obtain pathway activity scores for pathways (`GO Ontology Biological Processes`) obtained from [**Maayan Lab**](https://maayanlab.cloud/Enrichr/#libraries), [**MSigDB**](https://www.gsea-msigdb.org/gsea/msigdb/human/collections.jsp#C5) or custom gene sets.

- Perform differential pathway activity analysis using limma

## **Data Prep Parameters**

- `test_names`: List of the different test names of interest.

- `save_prefix`: Preferred prefix for saving critical files. Ideally chosen to be in the format `{source name}_{brain region}`. e.g `mathys_pfc`

- `subject_id`: Column name for Subject/Patient ID in both metadata and `.obs`

In [4]:
# ---- Run configuration: everything a reader might tune for this analysis ----
save_prefix = 'seaad_mtg'                                       # this takes the format '{StudyName}_{ThreeLetterAcronymForBrainRegion}'
subject_id = helper_functions.clean_strings('Donor ID')
cell_type_column = 'Subclass'                                   # 'Supertype (non-expanded)', 'Subclass'
factor = 'pathology.group'                                      # alternative noted by the author: Continuous Pseudoprogression Score
factor = factor.replace(" ", "").replace("-", "")               # strip spaces and hyphens from the factor name
test_names = ['late_vs_early']                                  # test categories
region_name = save_prefix.split('_')[-1].upper()                # last '_'-separated token of save_prefix, upper-cased

data_dir = f'/media/tadeoye/Volume1/data/seq/SEA-AD/{region_name}/RNAseq/'
save_dir = '../results/'

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# Broad cell class -> list of subclass labels belonging to it.
subclass = {
    'excitatory': ['L5 IT', 'L2/3 IT', 'L4 IT', 'L6 IT', 'L6 IT Car3', 'L5/6 NP', 'L6b', 'L6 CT', 'L5 ET'],
    'inhibitory': ['Pvalb', 'Sst', 'Lamp5 Lhx6', 'Vip', 'Lamp5', 'Sncg', 'Chandelier', 'Sst Chodl', 'Pax6'],
    'astrocyte': ['Astrocyte'],
    'microglia': ['Microglia-PVM'],
    'opc': ['OPC'],
    'oligodendrocyte': ['Oligodendrocyte'],
    'endothelial': ['Endothelial'],
    'vlmc': ['VLMC'],
    }

# Flat list of every subclass label, in the dictionary's insertion order.
cell_supertypes = list(chain.from_iterable(subclass.values()))

# **Load and Merge Pseudo-bulked Data**

We have previously standardized and pseudo-bulked data for each cell supertype (`00_Standardize_and_Split_RNAseq.ipynb`, `01_pseudobulking.ipynb`). First, we load the data for each cell supertype and merge them into a single AnnData object, which is then passed to R as a SingleCellExperiment. 

In [5]:
# Case-preserving cleaned copies of the subclass labels:
#   clean_subclass        -- same grouping as `subclass`, wrapped as an R named list
#   clean_cell_supertypes -- flat list parallel to `cell_supertypes`
clean_subclass = robjects.ListVector(
    {
        group: [helper_functions.clean_strings(label, preserve_case=True) for label in members]
        for group, members in subclass.items()
    }
)
clean_cell_supertypes = [
    helper_functions.clean_strings(label, preserve_case=True)
    for label in cell_supertypes
]

In [6]:
%%R -i data_dir -i subject_id -i cell_supertypes -i clean_cell_supertypes -i clean_subclass -o summed_counts_per_celltype

source('../scripts/functions/helper_functions.r')

# Per-cell-type containers, keyed by the cleaned cell-type name.
summed_counts_per_celltype <- list()
expressed_genes_per_celltype <- list()

for (cell_type in clean_cell_supertypes) {

  tryCatch({
    # <data_dir>/anndata/<parent group>/<cell type>/ -- resolved once, shared by both reads.
    cell_type_dir <- file.path(
      data_dir,
      "anndata",
      find_parent_key(cell_type, clean_subclass),
      cell_type
    )

    # Read summed counts and expressed genes.
    # NOTE: the counts entry is registered before the expressed-genes file is read,
    # so it is kept even if only the second read fails.
    summed_counts_per_celltype[[cell_type]] <- readRDS(
      file.path(cell_type_dir, paste0(cell_type, "_pseudobulked_count_data.rds"))
    )

    expressed_genes_per_celltype[[cell_type]] <- readRDS(
      file.path(cell_type_dir, paste0(cell_type, "_pseudobulked_expressed_genes.rds"))
    )
  },
  error = function(e) {
    # Any error lands here (not only a missing file), so the underlying message is
    # reported too; otherwise a different failure would be misread as "data not found".
    warning(sprintf("PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR '%s'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING %s (underlying error: %s)",
                   cell_type, cell_type, conditionMessage(e)))
  })
}

1: In gzfile(file, "rb") :
  cannot open compressed file '/media/tadeoye/Volume1/data/seq/SEA-AD/MTG/RNAseq//anndata/excitatory/L5_ET/L5_ET_pseudobulked_count_data.rds', probable reason 'No such file or directory'
2: In value[[3L]](cond) :
  PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR 'L5_ET'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING L5_ET
3: In gzfile(file, "rb") :
  cannot open compressed file '/media/tadeoye/Volume1/data/seq/SEA-AD/MTG/RNAseq//anndata/inhibitory/Sst_Chodl/Sst_Chodl_pseudobulked_count_data.rds', probable reason 'No such file or directory'
4: In value[[3L]](cond) :
  PSEUDOBULKED EXPRESSION DATA NOT FOUND FOR 'Sst_Chodl'. PLEASE CONFIRM THAT THIS IS INTENTIONAL. SKIPPING Sst_Chodl


In [7]:
# Concatenate the per-cell-type pseudobulk objects returned by the R cell into one AnnData.
# join='outer' keeps the union of variables across the inputs.
adata_merged = ad.concat([summed_counts_per_celltype[key] for key in summed_counts_per_celltype.keys()], join='outer')
# Densify the expression matrix.
# NOTE(review): assumes X is a sparse matrix exposing .toarray() after the concat -- confirm.
adata_merged.X = adata_merged.X.toarray()
# Display the merged object's summary.
adata_merged

AnnData object with n_obs × n_vars = 150897 × 36601
    obs: 'donor_id', 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'cells_merged', 'Subclass', 'ident', 'method', 'continuous_pseudo_progression_score', 'age_at_death_binned_codes', 'sex', 'race_choice_white', 'genes_detected', 'number_of_umis', 'pmi', 'apoe4_status'
    layers: 'logcounts'

In [8]:
%%R -i adata_merged

# `-i adata_merged` copies the merged Python object into the R session under the same name.
# Later R cells read `adata_merged` from the R session, so this cell must run first.
print('Merged data loaded')
# Show the object's summary; it is expected to print as a SingleCellExperiment
# (conversion presumably handled by anndata2ri -- check the printed class).
adata_merged

[1] "Merged data loaded"
class: SingleCellExperiment 
dim: 36601 150897 
metadata(0):
assays(2): X logcounts
rownames(36601): MIR1302-2HG FAM138A ... AC007325.4 AC007325.2
rowData names(0):
colnames(150897): L5 IT#H19.33.004_1 L5 IT#H19.33.004_2 ...
  VLMC#H21.33.027_8 VLMC#H21.33.027_9
colData names(16): donor_id orig.ident ... pmi apoe4_status
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):


In [9]:
# Drop the Python-side references; only the Python names are removed here.
del adata_merged
del summed_counts_per_celltype

# **Pathway Database**

First, we load pre-defined pathway annotations from established databases and then filter these databases for expressed genes.

In [10]:
# Derive the pathway-database folder by swapping the sequencing-specific
# part of `data_dir` for "/pathway_databases".
database_dir = data_dir.replace(
    f"/seq/SEA-AD/{region_name}/RNAseq/",
    "/pathway_databases",
)

### **Curated Gene Sets Obtained From [Galea et al. 2022](https://doi.org/10.1016/j.nbd.2022.105655)**

In [11]:
# Galea et al. curated gene sets: read the spreadsheet, name each set
# "<functional category>: <functional subcategory>", and export it through the GMT helper.
galea_dir = f'{database_dir}/galea_2022_curated_gene_sets'

galea_curated = pd.read_excel(f'{galea_dir}/gene_sets.xlsx')
galea_curated['Gene set'] = (
    galea_curated['Functional categories'] + ': ' + galea_curated['Functional subcategories']
)
galea_curated = galea_curated.rename(columns={'Gene symbol': 'Gene'})

gl_gmt = helper_functions.save_curated_as_gmt(
    galea_curated,
    name_col='Gene set',
    member_col='Gene',
    tempdir=f'{galea_dir}/',
)
# NOTE(review): presumably the file written by save_curated_as_gmt -- confirm the helper's output name.
gl_gmt_path = f'{galea_dir}/gs.gmt'

### **Curated Gene Sets Obtained From [Gabitto et al. 2023](https://doi.org/10.1101/2023.05.08.539485)**

In [12]:
# Gabitto et al. curated gene sets: the spreadsheet is passed to the GMT helper
# using its 'Gene set' and 'Gene' columns, with no renaming.
gabitto_dir = f'{database_dir}/gabitto_2023_curated_gene_sets'

gabitto_curated = pd.read_excel(f'{gabitto_dir}/gene_sets.xlsx')
gb_gmt = helper_functions.save_curated_as_gmt(
    gabitto_curated,
    name_col='Gene set',
    member_col='Gene',
    tempdir=f'{gabitto_dir}/',
)
# NOTE(review): presumably the file written by save_curated_as_gmt -- confirm the helper's output name.
gb_gmt_path = f'{gabitto_dir}/gs.gmt'

### **Curated Synaptic Related Gene Sets Obtained From [Koopmans et al. 2019](https://www.sciencedirect.com/science/article/pii/S0896627319304271?via%3Dihub)**

In [13]:
# SynGO annotations: rename the GO term name and HGNC symbol columns to the
# 'Gene set' / 'Gene' columns passed to the GMT helper, then export.
syngo_dir = f'{database_dir}/SynGO_bulk_download_release_20231201'

syngo_curated = pd.read_excel(f'{syngo_dir}/syngo_annotations.xlsx')
syngo_curated = syngo_curated.rename(columns={'go_name': 'Gene set', 'hgnc_symbol': 'Gene'})

syngo_gmt = helper_functions.save_curated_as_gmt(
    syngo_curated,
    name_col='Gene set',
    member_col='Gene',
    tempdir=f'{syngo_dir}/',
)
# NOTE(review): presumably the file written by save_curated_as_gmt -- confirm the helper's output name.
syngo_gmt_path = f'{syngo_dir}/gs.gmt'

### **Curated Gene Sets Obtained From Human Molecular Signatures Database ([MSigDB](https://www.gsea-msigdb.org/gsea/msigdb/human/collections.jsp#C5))**
- **[Reactome](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=CP:REACTOME)**
- **[KEGG](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=CP:KEGG_LEGACY)**
- **[Gene Ontology: Biological Processes](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=GO:BP)**
- **[TFT: transcription factor targets](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=TFT)**

In [14]:
# MSigDB (v2024.1, human gene symbols) GMT files, one path per collection used downstream.
bp_gmt_path = f'{database_dir}/MSigDB/c5.go.bp.v2024.1.Hs.symbols.gmt'                  # C5 GO:BP: Gene Ontology biological process gene sets
ke_gmt_path = f'{database_dir}/MSigDB/c2.cp.kegg_legacy.v2024.1.Hs.symbols.gmt'         # C2 CP:KEGG_LEGACY: canonical pathways, KEGG legacy subset
re_gmt_path = f'{database_dir}/MSigDB/c2.cp.reactome.v2024.1.Hs.symbols.gmt'            # C2 CP:REACTOME: canonical pathways, Reactome subset
tft_gtrd_gmt_path = f'{database_dir}/MSigDB/c3.tft.gtrd.v2024.1.Hs.symbols.gmt'         # C3 TFT:GTRD: transcription factor targets, GTRD subset
tft_legc_gmt_path = f'{database_dir}/MSigDB/c3.tft.tft_legacy.v2024.1.Hs.symbols.gmt'   # C3 TFT:TFT_LEGACY: transcription factor targets, legacy subset

# **Pathway Activity Scoring**

We now implement the pathway activity scoring pipeline employed by **[Yaru Zhang et al. 2020](https://www.sciencedirect.com/science/article/pii/S2001037020304293#s0135)**. 

Briefly, **[Zhang et al.](https://www.sciencedirect.com/science/article/pii/S2001037020304293#s0135)** performed a systematic benchmarking study to evaluate the performance of unsupervised pathway activity score (`PAS`) transformation algorithms. Essentially, PAS algorithms transform the gene-level data into explainable gene sets representing biological processes or pathways to uncover the potential mechanisms of cell heterogeneity. 

The tools benchmarked in the study include : 

> *Please hover over each method name to view full description*

- [`AUCell`](https://www.nature.com/articles/nmeth.4463 "AUCell is a rank-based enrichment analysis method for quantifying gene set activity in single-cell transcriptomics data. The algorithm employs a recovery-based scoring approach that first ranks genes according to their expression magnitude within individual cells. Subsequently, it computes an area under the recovery curve (AUC) by assessing the fraction of gene set members among the top-ranked genes, thereby generating cell-specific enrichment scores that represent the PAS"): Area under the ranked gene expression curve.

- [`Vision`](https://doi.org/10.1038/s41467-019-12235-0 "Vision is an annotation toolkit that uses autocorrelation statistics to identify biological variations across cells. Vision starts by identifying the closest K-nearest neighbors of each cell for generating a cell–cell K-nearest-neighbor (KNN) graph. PASs in Vision are calculated by averaging expressed genes for each gene set. To account for the influence of sample-level metrics (the number of UMIs/reads per cell), PASs are then corrected by their means and standard deviations. Expression data used in Vision could be scaled and normalized, but not log-transformed."): Summarizing the normalized expression of genes in the gene sets.

- [`Pagoda2`](https://doi.org/10.1038/nbt.4038.Integrative "Pathway and gene set overdispersion analysis (Pagoda2) implements a pathway overdispersion analysis method for single-cell data. It first evaluates the variance of individual genes, taking into account technical factors. It then identifies pathways showing coordinated variability by combining the variance patterns of pathway genes using weighted PCA, while controlling for technical aspects and adjusting for multiple hypothesis testing."): First principal component of gene sets

- [`GSVA`](https://doi.org/10.1186/1471-2105-14-7 "GSVA implements a non-parametric, unsupervised method for estimating variation in pathway activity across samples. It first estimates a normalized relative expression level per gene across samples. This expression level is then rank-ordered for each sample and aggregated into gene sets by calculating sample-wise enrichment scores using a Kolmogorov–Smirnov-like rank statistic."): Kolmogorov-Smirnov-like rank statistic based on kernel estimation of the cumulative density.

- [`ssGSEA`](https://doi.org/10.1038/nature08460.Systematic "Briefly, ssGSEA implements a rank-based method to generate an enrichment score per sample per gene set. It first ranks all genes by their expression value for each sample. Then, for each gene set, it calculates an enrichment score that represents the degree of absolute enrichment of that gene set in each sample based on the integration of the differences between empirical cumulative distribution functions of gene expression ranks."): Kolmogorov-Smirnov-like rank statistic based on gene expression of single sample. 

- [`Combined z-score`](https://doi.org/10.1371/journal.pcbi.1000217 "This method implements a parametric method for pathway activity estimation. It first standardizes gene expression values across samples to generate z-scores for each gene. These gene-level z-scores are then aggregated for each pathway by calculating their mean or sum, producing a pathway-level score that represents the overall activation state of the pathway in each sample."): Combined z-score.

## **Method Selection for PAS**

**[Zhang et al.](https://www.sciencedirect.com/science/article/pii/S2001037020304293#s0135)** identified `Pagoda2` and `Vision` as the highest performing methods for pathway activity scoring, likely due to their direct estimation of pathway activity scores (PAS) from the expression data.

<figure align="center">
   <img src="../docs/pas_performance_summary.jpg" alt="Pathway Activity Score Performance Summary" width="30%">
   <figcaption><b>Figure 1:</b> Comparative performance evaluation of pathway activity scoring methods across multiple metrics</figcaption>
</figure>

Based on these findings, our analysis focuses on the three top-performing methods `Pagoda2`, `Vision`, and `AUCell`, as well as `GSVA`, following our previous work ([**Adeoye et al. 2024**](https://www.aginganddisease.org/EN/10.14336/AD.2024.0429)). However, here we elect to use only `AUCell`, as the other listed methods do not scale well to data of this size.

> **Note**: Users preferring GSVA can still efficiently apply it by first setting `count_agg_strategy = pseudobulk` in `01_pseudobulking.ipynb` to use standard pseudobulking instead of metacells. But we doubt this will yield reliable results.

The authors provide the [PASBench](https://github.com/sulab-wmu/PASBench/tree/master) package that contains the seven PAS tools and evaluation metrics.

## **Parameter Selection for [PASBench](https://github.com/sulab-wmu/PASBench/tree/master)**

The following parameters control the calculation of pathway activity scores from single-cell gene expression data:

#### **Core Parameters**
- `counts`: the pseudobulked SingleCellExperiment object. It must contain an expression `counts` assay/layer.

- `gmt_file`: Path to GMT format file containing pathway definitions.

- `species`: Species designation for pathway analysis. Options: `"human"` or `"mouse"`.

- `pathway`: Abbreviated name of pathway database (e.g., `KEGG`, `GO`, `REACTOME`). Only called when gmt_file is `none`

- `tool`: Specifies the analysis method to use. In this case, any of `Pagoda2` or `AUCell`
 > *See [method descriptions above](#) for details on each tool*

- `filter`: Controls expression-based gene filtering. When `TRUE`, removes genes expressed in <5% of cells. When `FALSE`, retains all genes. We already filtered genes before pseudobulking, so this option from **[Zhang et al.](https://www.sciencedirect.com/science/article/pii/S2001037020304293#s0135)** will be set to `FALSE`.

- `normalize`: Specifies data normalization method. This is required to make expression values comparable across cells. The specified method should be appropriate for chosen tool. Options are `log`,`CLR`(Centered Log Ratio in Seurat),`RC` (Relative Counts in Seurat),`scran`,`scTransform`,`none`.

The figure below shows the distribution of averaged performance score across normalization strategies from **[Zhang et al.](https://www.sciencedirect.com/science/article/pii/S2001037020304293#s0135)**. 

<figure align="center">
   <img src="../docs/performance_by_normalization_strategy.jpg" alt="Averaged Performance Score Across Filtering and Normalization Strategies" width="50%">
   <figcaption><b>Figure 2:</b> Comparative performance evaluation of pathway activity scoring methods across filtering and normalization strategies</figcaption>
</figure>

This figure shows that:

- `Pagoda2` and `Vision` display the same performance irrespective of the chosen normalization method. Hence, we perform no normalization

- Performance of `AUCell` depends on the normalization strategy. So, we choose [scTransform](https://github.com/satijalab/sctransform) normalization, since the other tools all display better performance with this method.

- For `GSVA`, we recommend setting the normalization method to `none`, as the default `assay` chosen from the `SingleCellExperiment` object is `logcounts`, which is appropriate for the `Gaussian` kernel applied to `log CPM` data. Users can likewise change parameters like `maxDiff`, `min/max_set_size`, and `kcdf`. 

In [None]:
# Scoring methods to run; only AUCell is enabled here.
pas_methods = ['AUCell']
# Normalization setting per method; the R scoring cell looks it up by method name.
normalization_methods = robjects.ListVector({'AUCell': 'none'})
assay_name = 'logcounts'    # assay name in pseudobulked singlecellexperiment object (read only by the GSVA branch of the scoring cell).
                            # If set to `logcounts` then `kcdf` must be set to `Gaussian` and normalization method must be 'none'
kcdf = 'Gaussian'           # determines the kernel density used to model gene expression across samples
max_set_size = 150          # Upper bound of gene set size
min_set_size = 5            # lower bound of gene set size
maxDiff = 'TRUE'            # Controls how enrichment score is calculated from the Kolmogorov-Smirnov random walk statistics
                            # (kept as a string here; the R scoring cell converts it with as.logical())
n_cores = 40                # number of cores handed to the scoring call

# Controls which curated gene set databases the scores are estimated for:
# short database label -> path of its GMT file.
curated_gene_set_db = robjects.ListVector({
    'gabitto': gb_gmt_path,
    'galea': gl_gmt_path,
    'syngo': syngo_gmt_path,
    'gobp': bp_gmt_path,
    'reactome': re_gmt_path,
    'kegg': ke_gmt_path, 
    'gtrd': tft_gtrd_gmt_path,
    'tft': tft_legc_gmt_path,
    })       

#### **Output Structure**
The implemented algorithms generate two complementary data structures:

1. A dictionary containing pathway activity scores for each gene set and method defined in `curated_gene_set_db`
2. A SingleCellExperiment object encoding a matrix of pathway activity scores (PAS) across cell types, where:
   - Rows represent individual pathways
   - Columns represent distinct cell types
   - Matrix elements contain non-/normalized PAS

The SingleCellExperiment object maintains the relationship between PAS and cellular identities while preserving the original experimental design structure. This allows us to perform downstream analyses on the activity scores, such as clustering and visualizing PAS in cellular populations.

In [None]:
%%R -i max_set_size -i min_set_size -i kcdf -i maxDiff -i curated_gene_set_db -i pas_methods -i n_cores -i normalization_methods -i assay_name -o pas_scores -o pas_object

source('../scripts/functions/PASBench/R/main.R')
source('../scripts/functions/helper_functions.r')

# Results are nested as <gene set database> -> <scoring method>:
#   pas_scores: data frame of pathway activity scores
#   pas_object: SingleCellExperiment built from those scores, with sample metadata attached
pas_scores <- list()
pas_object <- list()
gene_sets <- curated_gene_set_db

for (gs in names(gene_sets)){
    pas_scores[[gs]] <- list()
    pas_object[[gs]] <- list()

    # Restrict the expression matrix to genes that occur in at least one set of this database.
    all_gs_genes <- unique(unlist(read.geneset(gene_sets[[gs]])))
    genes_to_keep <- rownames(adata_merged) %in% all_gs_genes

    for (method in pas_methods){

        # GSVA reads the assay named by `assay_name` (imported via -i above);
        # every other method reads the 'X' assay.
        if (method == 'GSVA'){
            sce <- assay(adata_merged[genes_to_keep, ], assay_name)
        } else {
            sce <- assay(adata_merged[genes_to_keep, ], 'X')
        }

        # run PAS scoring
        print(paste0("Estimating Activity Scores of ", gs, " Pathways using ", method, " scoring method"))
        print("........")

        res <- calculate_PAS(
            sce,
            method,                         # tool name
            species = 'none',               # species; human or mouse
            pathway = 'none',               # abbreviation for pathway database
            gmt_file = gene_sets[[gs]],     # pathways in GMT format
            filter = F,                     # whether filtering for genes expressed in less than 5 percent cells
            gsvaPar = list(                 # Parameters for when method selected is GSVA
                maxDiff = as.logical(maxDiff),
                kcdf = kcdf,
                minSize = min_set_size,
                maxSize = max_set_size
                ),

            normalize = normalization_methods[[method]],
            n_cores = n_cores,
            rand_seed = 12345
        )

        pas_scores[[gs]][[method]] <- as.data.frame(res)
        pas_sce <- as.SingleCellExperiment(prepare_vis(pas_scores[[gs]][[method]]))

        # Attach the sample metadata from `adata_merged`, joined on row names (by = 0).
        # merge() returns rows sorted by the join key, so the result is re-indexed to the
        # column order of `pas_sce` before assignment; otherwise the metadata (and with it
        # the column names) could end up attached to the wrong score columns.
        sample_ids <- colnames(pas_sce)
        merged_meta <- merge(
            as.data.frame(colData(pas_sce)),
            as.data.frame(colData(adata_merged)),
            by = 0
        )
        rownames(merged_meta) <- merged_meta$Row.names

        missing_ids <- setdiff(sample_ids, rownames(merged_meta))
        if (length(missing_ids) > 0) {
            stop(sprintf("%d column(s) of the %s/%s score object have no metadata in adata_merged (e.g. '%s')",
                         length(missing_ids), gs, method, missing_ids[[1]]))
        }

        colData(pas_sce) <- DataFrame(merged_meta[sample_ids, , drop = FALSE])
        rownames(colData(pas_sce)) <- sample_ids

        pas_object[[gs]][[method]] <- pas_sce
    }
}

[1] "Estimating Activity Scores of gabitto Pathways using AUCell scoring method"
[1] "........"
Do not filter genes.
[1] "normalize error"
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 150897
Number of edges: 4437850

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9422
Number of communities: 25
Elapsed time: 120 seconds
[1] "Estimating Activity Scores of galea Pathways using AUCell scoring method"
[1] "........"
Do not filter genes.
[1] "normalize error"
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 150897
Number of edges: 4423655

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9483
Number of communities: 32
Elapsed time: 127 seconds
[1] "Estimating Activity Scores of syngo Pathways using AUCell scoring method"
[1] "........"
Do not filter genes.
[1] "normalize error"
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of no

Read 129 records
Read 8627 items
Centering and scaling data matrix
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
Computing nearest neighbor graph
Computing SNN
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Read 215 records
Read 3223 items
Centering and scaling data matrix
Computing nearest neighbor graph
Computing SNN
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Read 267 records
Read 4147 items
  The following gene sets will be excluded from the analysis(less than 20% of their genes are available):
maintenance of alignment of postsynaptic density and presynaptic active zone (GO:0099559), postsynaptic spectrin-associated cytoskeleton organization (GO:0099190), presynaptic 

#### **Save Pathway Activity Scores**

In [39]:
%%R

# Save the nested list of PAS objects as one RDS file under <data_dir>results/PAS/.
pas_dir <- paste0(data_dir, "results/PAS/")

# recursive = TRUE also creates the intermediate "results" folder when it does not exist yet;
# without it dir.create() fails and saveRDS() cannot open the target file.
if (!dir.exists(pas_dir)) {
  dir.create(pas_dir, recursive = TRUE)
}

saveRDS(pas_object, paste0(pas_dir, "all_PAS.rds"))

In [41]:
# Write one gzip-compressed .h5ad file per (gene set database, scoring method) pair.
for gs in curated_gene_set_db.names:
    for method in pas_methods:
        h5ad_path = os.path.join(data_dir, "results", "PAS", f"{gs}_geneset_{method}_PAS.h5ad")
        pas_object[gs][method].write_h5ad(h5ad_path, compression='gzip')