In [1]:
import os
import rpy2
import logging
import warnings
import anndata
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import decoupler as dc
from anndata import AnnData
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from sklearn.model_selection import train_test_split


In [2]:
# Runtime configuration for the Python <-> R bridge and for scanpy.

# Silence R console messages below ERROR level.
# Note: this can be commented out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Suppress whole warning categories for the rest of the session.
# NOTE(review): this also hides every UserWarning raised by later cells.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert objects crossing the rpy2 boundary
# (pandas DataFrames, and AnnData <-> SingleCellExperiment via anndata2ri).
pandas2ri.activate()
anndata2ri.activate()
# Registers the %%R cell magic used by the R cells below.
%load_ext rpy2.ipython

# scanpy logging verbosity and a dump of package versions for provenance.
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
appnope                     0.1.3
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
executing                   1.2.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain                     0.8.0
markupsafe 

In [3]:
%%R
# Attach every R package used by the R cells of this notebook.
# Start-up messages are suppressed to keep the cell output short.
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)
    library(conflicted)

    # single-cell analysis package
    library(Seurat)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages:
    # NOTE(review): WGCNA is already attached at the top of this block.
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    # gene set containers and gene set variation analysis
    library(GSEABase)
    library(GSVA) 
# NOTE: this block must be re-run in every new R session before using the
# attached packages (e.g. the %>% pipe).
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

1: package ‘AnnotationDbi’ was built under R version 4.3.1 
2: package ‘IRanges’ was built under R version 4.3.1 
3: package ‘S4Vectors’ was built under R version 4.3.1 
4: package ‘GSVA’ was built under R version 4.3.1 


### **Prepare data**

Load the preprocessed and annotated data for differential pathway analysis.

Make sure the AnnData object holds the unnormalized count data either in `.layers['counts']` or in `.X`. If there is no `counts` layer, `.X` is assumed to contain the counts.

In [4]:
# Study selector. It drives every input/output path and every per-study lookup below.
save_prefix = 'leng_etc' # this takes the format '{StudyName}_{ThreeLetterAccronymForBrainRegion}'
get_cell_types = False   # whether to reformat cell_type annotation

# Load the annotated single-cell object and make cell/gene names unique.
adata_annot = sc.read_h5ad(f'../data/raw/{save_prefix}/{save_prefix}_raw_anndata.h5ad')
adata_annot.obs_names_make_unique()
adata_annot.var_names_make_unique()

# Ensure `.X` holds the unnormalized counts.
if 'counts' not in adata_annot.layers.keys():
    # No dedicated layer: `.X` is taken as the count matrix as-is.
    # The counts are deliberately NOT copied into a layer here, because all
    # layers are dropped a few lines below; the copy would only cost memory.
    print('"counts" not in layers...')
    print('analysis requires unnormalized count data...')
    print('fetching count data from "adata.X"...')
else:
    adata_annot.X = adata_annot.layers['counts'].copy()

# Strip everything except X / obs / var so that only the minimum is held in
# memory and later transferred to R.
del adata_annot.obsm, adata_annot.layers, adata_annot.varm, adata_annot.uns, adata_annot.obsp

adata_annot

"counts" not in layers...
analysis requires unnormalized count data...
fetching count data from "adata.X"...


AnnData object with n_obs × n_vars = 42528 × 33694
    obs: 'SampleID', 'PatientID', 'BrainRegion', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'cell_type'

In [5]:
# Number of cells per annotated cell type.
adata_annot.obs['cell_type'].value_counts()

cell_type
Excitatory         10780
Oligodendrocyte     9615
Microglia           6003
Astrocyte           5916
Inhibitory          5571
OPC                 3913
Endothelial          730
Name: count, dtype: int64

Specify the way to map cells to appropriate cell-types

In [6]:
# Canonical cell-type labels analysed throughout the notebook.
celltypes = [
    "Excitatory",
    "Inhibitory",
    "Astrocyte",
    "Microglia",
    "Oligodendrocyte",
    "OPC",
    "Endothelial",
]

# Abbreviated labels shared by both Leng et al. regions -> canonical labels.
_leng_label_pairs = (
    ('Exc', 'Excitatory'),
    ('Inh', 'Inhibitory'),
    ('Astro', 'Astrocyte'),
    ('Endo', 'Endothelial'),
    ('Micro', 'Microglia'),
    ('OPC', 'OPC'),
    ('Oligo', 'Oligodendrocyte'),
)

# SEA-AD already uses the canonical labels, so its mapping is the identity.
_seaad_labels = (
    'Excitatory',
    'Inhibitory',
    'Astrocyte',
    'Microglia',
    'Endothelial',
    'OPC',
    'Oligodendrocyte',
)

# Per-study mapping from the study's own labels to the canonical labels.
# Each study gets its own dict object so that editing one never affects another.
mapping = {
    'leng_etc': dict(_leng_label_pairs),
    'leng_sfg': dict(_leng_label_pairs),
    'seaad_mtg': {label: label for label in _seaad_labels},
}

# Per-study name of the `.obs` column that holds the study's own labels.
cell_column = dict(
    leng_etc='clusterCellType',
    leng_sfg='clusterCellType',
    seaad_mtg='cell_type',
)

### **Additional Parameters**

In [7]:
control_group = 'no'                        # value of `pathology.group` in the metadata that marks the control group
map_meta = True                             # NOTE(review): not referenced by any visible cell -- confirm whether still needed

# Pseudobulking strategy; each option is handled by its own section further down.
pseudobulking_strategy = 'blanchardbulk'      # options ['blanchardbulk', 'metacell', 'standardbulk'] See details in subsequent sections

pseudobulk_method = 'agg_x_norm'            # Used by the 'blanchardbulk' strategy (passed to `get_ind_level_ave`): method for aggregating cells from replicates into pseudobulks. ['agg_x_norm', 'norm_x_agg']
gene_selection = 'fraction'                 # Only required if `pseudobulking_strategy == 'metacell'`: gene selection method when setting up the Seurat object for WGCNA ('custom' or 'fraction').
geneSet = ''                                # Only required if `pseudobulking_strategy == 'metacell'` and `gene_selection == 'custom'`: genes to include

technical_covariate = 'SampleBatch'         # Technical covariate passed to `get_fits`; intended as a random effect not of interest (regressed out by duplicate correction)
duplicate_correction = 'FALSE'              # whether to run duplicate correction in limma to block out technical_covariate. NOTE(review): imported into the limma cell but not referenced in its visible code
gene_celltype_threshold = 0.10              # expression threshold for gene inclusion per cell type (used as the fraction of cells in the 'metacell' branch)
filter_genes_from = 'pseudobulk'            # whether to filter genes after aggregating pseudobulks or from the single-cell object
pathway_gene_threshold = 0.33               # threshold passed to `filter_lowly_exp_genes`; presumably the fraction of a pathway's genes that must be expressed -- TODO confirm


metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv' # Metadata location
meta = pd.read_csv(metadata, encoding_errors='ignore')  # undecodable bytes in the file are dropped silently

test_names = ['early_vs_no', 'late_vs_early', 'late_vs_no', 'ad_vs_no'] # test categories

# Biological covariates per study; ['None'] means no covariate is supplied.
covaraites_for_study = {'leng_sfg': ['None'],  # ['ageDeath.cat',],
                        'leng_etc': ['None'],  # ['ageDeath.cat',]
                        'seaad_mtg': ['None'],
                        }

# Name of the subject/donor identifier column per study.
subject_ids_for_study = {'leng_sfg': 'PatientID',
                        'leng_etc': 'PatientID',
                        'seaad_mtg': 'individualID',}

bio_covariates = covaraites_for_study[save_prefix]  # list of covariates to be accounted for in regression.
subject_id = subject_ids_for_study[save_prefix]     # for leng this is `PatientID`, for mathys it is 'Subject', and for allen it is 'individualID'



In [8]:
if get_cell_types:
    # Translate the study's own labels into the canonical names in `celltypes`.
    adata_annot.obs['cell_type'] = adata_annot.obs[cell_column[save_prefix]].map(mapping[save_prefix])

# Keep only cells of the analysed cell types (applied once; the original cell
# ran this identical filter twice). The explicit `.copy()` turns the view
# returned by the subsetting into a real AnnData object, so the column
# assignment below does not write into a view.
adata_annot = adata_annot[adata_annot.obs.cell_type.isin(celltypes)].copy()

# Map the pathology group to the subject id in .obs.
# Metadata subject ids are compared as strings.
subject_to_pathology = dict(zip(meta[subject_id].astype(str), meta['pathology.group']))
adata_annot.obs['pathology.group'] = adata_annot.obs[subject_id].map(subject_to_pathology)

### **Send parameters to R interface with rpy2**

In [9]:
%%R -i adata_annot -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i save_prefix -i gene_selection

# The objects listed with -i are copied from Python into the R session here and
# are reused by the later R cells. `adata_annot` arrives as a
# SingleCellExperiment (see the printed summary).
print(adata_annot)
print('loaded data into memory for recursive use')

class: SingleCellExperiment 
dim: 33694 42528 
metadata(0):
assays(1): X
rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
rowData names(0):
colnames(42528): EC2_AAACCTGAGGATGCGT EC2_AAACCTGAGTCAATAG ...
  EC10_TTTGTCATCTATCGCC EC10_TTTGTCATCTCTGCTG
colData names(13): SampleID PatientID ... cell_type pathology.group
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
[1] "loaded data into memory for recursive use"


## **2.4 Systematic differential analysis of pathway activity**

Pathway activity scores were computed in accordance with protocols outlined in [**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w).


We perform differential pathway activity analysis using pathways obtained from [**Gene Ontology Biological Processes (2018 edition)**](https://maayanlab.cloud/Enrichr/#libraries).

### **Load and Process Pathways**

In [10]:
# Read the pathway library: one row per pathway, with the pathway name in
# column 0 and its member genes in the remaining columns.
bp = pathway_analyses.read_pathways('../data/pathway_databases/GO_Biological_Process_2018.txt')

# Index by pathway name and blank out the padding left by ragged rows.
go_bp_paths = bp.set_index(0).fillna("")
go_bp_paths_dict = go_bp_paths.to_dict(orient='index')

# pathway name -> list of its genes, with the blank padding removed.
gene_set_by_path = {}
for pathway_name, row_values in go_bp_paths_dict.items():
    gene_set_by_path[pathway_name] = [gene for gene in row_values.values() if gene != ""]

# One column per pathway; shorter gene lists are padded by pandas.
gene_set_by_path = pd.DataFrame.from_dict(gene_set_by_path, orient='index').transpose()

  return pd.read_csv(filename, header=None, delimiter="\t", names=column_names)


### **Pseudobulk Aggregation and Filtering**


Briefly, we first computed cell-type-level normalized gene expression profiles for each individual using the ACTIONet normalization procedure (Mohammadi et al., 2020). We additionally provide two other options for `pseudobulking_strategy`: the `metacell` approach from `hdWGCNA`, and the standard pseudobulk (`standardbulk`) method provided in a custom script.

#### If **`pseudobulking_strategy == 'blanchardbulk'`**

In [11]:
if pseudobulking_strategy == 'blanchardbulk':

    # The helpers below work on a `counts` layer, which was dropped at load time.
    adata_annot.layers['counts'] = adata_annot.X.copy()

    # Genes considered expressed in each cell type.
    expressed_genes_per_celltype = pathway_analyses.filter_expressed_genes_by_celltype(
        adata_annot,
        threshold=gene_celltype_threshold,
        filter_genes_from=filter_genes_from,
        subject_id=subject_id,
    )

    # Per-cell-type expression profiles aggregated per individual.
    avs_logcounts_cellxind = pathway_analyses.get_ind_level_ave(
        adata_annot,
        subject_id,
        pseudobulk_method,
        expressed_genes_per_celltype,
        filter_genes_at_threshold=True,
    )

# Runs for every strategy: the Python-side object is released here, while the
# R session keeps the copy that was pushed to it earlier.
del adata_annot

#### If **`pseudobulking_strategy == 'metacell'` (TIME CONSUMING)**

In [12]:
%%R -i pseudobulking_strategy -o meta_obj -o expressed_genes -i geneSet

# Metacell pseudobulking with hdWGCNA.
# Relies on objects pushed to R by an earlier cell: adata_annot, subject_id,
# celltypes, gene_selection and gene_celltype_threshold.
# Outputs (both NULL for any other strategy):
#   meta_obj        - per cell type, the metacell object as a SingleCellExperiment
#   expressed_genes - per cell type, the genes selected for the hdWGCNA experiment

if (pseudobulking_strategy == 'metacell'){
  seurat_obj <- as.Seurat(adata_annot, counts = "X", data = "X")

  print(seurat_obj)

  # Dimensionality reduction; the harmony embedding is integrated over subjects
  # and is the space in which metacells are built below.
  seurat_obj <- FindVariableFeatures(seurat_obj)
  seurat_obj <- ScaleData(seurat_obj)
  seurat_obj <- RunPCA(seurat_obj)
  seurat_obj <- RunHarmony(seurat_obj, group.by.vars = subject_id)
  seurat_obj <- RunUMAP(seurat_obj, reduction='harmony', n.neighbors=15, dims=1:30, min.dist=0.1)

  # create a hdWGCNA experiment for each celltype
  seurat_dat <- seurat_obj
  meta_obj <- list()
  expressed_genes <- list()

  for (cur_cell_type in celltypes){

    # Name of the hdWGCNA experiment: first three letters of the cell type,
    # upper-cased. Computed once and passed explicitly to every hdWGCNA call.
    wgcna_name <- toupper(substr(cur_cell_type, 1, 3))

    seurat_obj <- subset(seurat_dat, cell_type == cur_cell_type)

    print(paste0('Creating hdWGNA Experiment for ', toupper(cur_cell_type)))

    if (gene_selection == 'custom') {
      seurat_obj <- SetupForWGCNA(
        seurat_obj,
        gene_select = "custom",            # the gene selection approach
        gene_list = as.vector(geneSet),    # list of genes to be included
        group.by = 'cell_type',            # grouping parameter
        wgcna_name = wgcna_name            # the name of the hdWGCNA experiment
      )
    } else {
      seurat_obj <- SetupForWGCNA(
        seurat_obj,
        gene_select = "fraction",             # the gene selection approach
        fraction = gene_celltype_threshold,   # fraction of cells for gene inclusion
        group.by = 'cell_type',               # grouping parameter
        wgcna_name = wgcna_name               # the name of the hdWGCNA experiment
      )
    }

    print(paste0('Constructing MetaCells in hdWGCNA Experiment for ', toupper(cur_cell_type)))

    # No trailing comma after the last argument: an empty trailing argument
    # would be passed to the function as a missing positional argument.
    seurat_obj <- MetacellsByGroups(
      seurat_obj = seurat_obj,
      group.by = c("cell_type", subject_id), # specify the columns in seurat_obj@meta.data to group by
      reduction = 'harmony',                 # select the dimensionality reduction to perform KNN on
      k = 25,                                # nearest-neighbors parameter
      max_shared = 10,                       # maximum number of shared cells between two metacells
      ident.group = 'cell_type',             # set the Idents of the metacell seurat object
      wgcna_name = wgcna_name                # the name of the hdWGCNA experiment
    )

    # normalize metacell expression matrix:
    seurat_obj <- NormalizeMetacells(seurat_obj, wgcna_name = wgcna_name)

    meta_obj[[cur_cell_type]] <- as.SingleCellExperiment(
      GetMetacellObject(seurat_obj, wgcna_name = wgcna_name)
    )

    # Read the genes from this cell type's experiment explicitly rather than
    # from whichever experiment happens to be active.
    expressed_genes[[cur_cell_type]] <- GetWGCNAGenes(seurat_obj, wgcna_name = wgcna_name)
  }

  # Free the large intermediate objects held by the R session.
  rm(seurat_obj)
  rm(seurat_dat)
  rm(adata_annot)
} else {
  meta_obj <- NULL
  expressed_genes <- NULL
}


In [13]:
if pseudobulking_strategy == 'metacell':

    # Gene lists differ in length between cell types; pad each with NaN up to
    # the longest one so they fit as columns of a single DataFrame.
    # (The original computed this maximum but never stored it, so the next
    # line failed with a NameError on `max_length`.)
    max_length = max(len(genes) for _, genes in expressed_genes.items())
    expressed_genes = {k: list(v) + [np.nan] * (max_length - len(v)) for k, v in expressed_genes.items()}
    expressed_genes_per_celltype = pd.DataFrame(expressed_genes, columns=expressed_genes.keys())

    # Per cell type: the `logcounts` layer of the metacell object, transposed
    # and labelled with the object's var_names (rows) and obs_names (columns).
    avs_logcounts_cellxind = {}
    for cell_type in celltypes:
        metacells = meta_obj[cell_type]
        avs_logcounts_cellxind[cell_type] = pd.DataFrame(metacells.layers['logcounts'].toarray().T,
                                                         columns=metacells.obs_names,
                                                         index=metacells.var_names)



#### If **`pseudobulking_strategy == 'standardbulk'`**

In [14]:
%%R -i pseudobulking_strategy -o pseudobulk_data

# Standard pseudobulk generation using the helper sourced from sconline_code.R.
# Uses `adata_annot` and `subject_id` as already present in the R session.
# `pseudobulk_data` is returned to Python (NULL for any other strategy).
# NOTE(review): no later visible cell consumes `pseudobulk_data`; the downstream
# cells read `avs_logcounts_cellxind` and `expressed_genes_per_celltype`, which
# this branch does not create -- confirm how this strategy is meant to continue.

if (pseudobulking_strategy == 'standardbulk'){

    library(scuttle)
    library(Matrix)
    library(ensembldb)
    library(EnsDb.Hsapiens.v86)
    conflicts_prefer(GenomicRanges::setdiff)

    source("../scripts/functions/sconline_code.R")

    min_size_limit = 15     # minimum acceptable size of the pseudocells. usually 10 or 15.
    nPCs = 30               # Number of PCs to use to create the similarity network. Used only if human or mouse
    organism = 'Human'

    # generating the embedding space (currently disabled)

    # exp_seurat = .extraExport2SeuratFn(adata_annot) %>%
    # Seurat::NormalizeData() %>%
    #     FindVariableFeatures() %>% 
    #     ScaleData() %>% 
    #     RunPCA(verbose=F)
  # Expose the matrix stored in assay 'X' under the assay name 'counts' as well.
  assay(adata_annot, 'counts') = assay(adata_annot, 'X')
  pseudobulk_data = suppressWarnings(.sconline.PseudobulkGeneration(argList = NULL, 
                                  # The columns in the phenoData that will be used to parse the expression data 
                                  # and generate the pseudocell/pseudobulk data
                                    parsing.col.names = c(subject_id, 'cell_type'), 
                                  # average pseudocell size.
                                    pseudocell.size = NULL,
                                    inputExpData = adata_annot,
                                  # minimum acceptable size (ie, #cells) for each pseudobulk
                                    min_size_limit = min_size_limit,
                                  # in case we want to run the function outside sconline space
                                    inputPhenoData = as.data.frame(colData(adata_annot)),
                                  # the embedding space to be used for the generation of the pseudobulk.
                                  # only needed when pseudocell.size is not null
                                  # inputEmbedding = embedding_data, 
                                  # the dimension of the embedding space for the construction of pseudobulk data
                                    nPCs = nPCs, 
                                    ncores = 3,
                                    rand_pseudobulk_mod = F,
                                    organism = organism))

} else {
  pseudobulk_data <- NULL
}

##### **Get pathways**

**Obtain filtered genesets**

In [15]:
# Restrict every pathway to the genes expressed in each cell type;
# `pathway_gene_threshold` is handed to the helper as its filtering threshold.
gene_set_per_celltype = pathway_analyses.filter_lowly_exp_genes(
    expressed_genes_per_celltype,
    gene_set_by_path,
    threshold=pathway_gene_threshold,
)

#### **Visualize Covariates on UMAP**

In [16]:
# adatas = []

# for cell_type in celltypes:
#     temp = AnnData(X=avs_logcounts_cellxind[cell_type].T)
#     temp.var_names = list(np.arange(0, len(avs_logcounts_cellxind[cell_type].index)))
#     temp.obs_names = avs_logcounts_cellxind[cell_type].columns
#     temp.obs['cell_type'] = cell_type
#     temp.obs[subject_id] = avs_logcounts_cellxind[cell_type].columns
#     for covariate in bio_covariates:
#         temp.obs[covariate] = temp.obs[subject_id].astype(str).map(dict(zip(meta[subject_id].astype(str), meta[covariate])))
#     temp.obs_names = [f'{cell_type}_{x}' for x in avs_logcounts_cellxind[cell_type].columns]
#     adatas.append(temp.copy())
#     #del temp

# adata_concat = anndata.concat(adatas, join='outer')
# sc.pp.pca(adata_concat)

# sc.set_figure_params(scanpy=True,
#                     dpi_save=300, frameon=False, 
#                     vector_friendly=True, fontsize=14, figsize=(7, 5), 
#                     color_map=None, format='pdf', facecolor=None,
#                     transparent=False, ipython_format='png2x')
                     
                     
# #pdata.obs['pmi'] = pdata.obs['pmi'].astype('float')

# dat_dir = f'../results/Covariates/'

# if not os.path.exists(dat_dir):
#     os.makedirs(dat_dir)

# if not os.path.exists('figures/'):
#     os.makedirs("figures/")

# sc.pl.pca(adata_concat[adata_concat.obs.cell_type.isin(celltypes)], color=['cell_type'] + bio_covariates,
#          ncols=3, size=300, save=f'_{save_prefix}_covariates.pdf', components=['1,2'])

# os.rename(f'../scripts/figures/pca_{save_prefix}_covariates.pdf',
#          f'../results/Covariates/{save_prefix}_covariates.pdf')


### **Prep data for Gene Set Variation Analysis in R**

In [17]:
# Hand the expression profiles and filtered gene sets of every cell type to
# `prep_gsva`, using one working directory per cell type.
for cell_type in celltypes:
    gsva_dir = f'../results/gsva/{save_prefix}/{cell_type}/'
    prep_gsva(
        avs_logcounts_cellxind[cell_type],
        gene_set_per_celltype[cell_type],
        tempdir=gsva_dir,
        verbose=False,
    )

### **Estimate Pathway Activity Scores**

After estimating individual-cell-type-level normalized gene expression profile averages, we next calculate pathway activity scores, as described in [**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w) and previously implemented in the R package GSVA (Gene Set Variation Analysis, v.1.42.0).

Briefly, GSVA estimates a normalized relative expression level per gene across samples. This expression level is then rank-ordered for each sample and aggregated into gene sets by calculating sample-wise enrichment scores using a Kolmogorov–Smirnov-like rank statistic.

[**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w) called the GSVA function with the following parameters:

- `mx.diff=TRUE` 
- `kcdf=c("Gaussian")`
- `min.sz=5` 
- `max.sz=150`

Here, we instead set `max.sz=500`, as described in the [**GSVA Tutorial**](http://www.bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html#5_Quantification_of_pathway_activity_in_bulk_microarray_and_RNA-seq_data)

In [18]:
%%R -o gsva_scores -i save_prefix -i celltypes

# Estimate GSVA pathway activity scores for every cell type.
# Per cell type, reads `gs.gmt` (gene sets) and `expr.csv` (expression matrix)
# from ../results/gsva/<save_prefix>/<celltype>/ and writes `gsva_scores.csv`
# into the same directory. The scores are also returned to Python in
# `gsva_scores`, a list keyed by cell type.

library(GSEABase)
library(GSVA)
library(tidyr)
library(dplyr)

# create an empty list to store the output for each celltype
gsva_scores = list()

# iterate over celltypes and perform gsva
for (celltype in celltypes) {
    # file.path() inserts the separators itself, so the components must not
    # carry their own '/' (the original produced paths such as
    # '../results/gsva//<prefix>///<celltype>/gs.gmt').
    celltype_dir = file.path('../results/gsva', save_prefix, celltype)

    # get gene sets as a named list: gene set name -> gene identifiers
    geneSets = geneIds(getGmt(file.path(celltype_dir, 'gs.gmt')))

    # read in expression data (first column holds the row names)
    mat = as.matrix(read.csv(file.path(celltype_dir, 'expr.csv'), header=TRUE, row.names=1, check.names=FALSE))

    # run gsva
    print(paste0("Estimating Pathway Activity Scores in ", celltype, " Cells"))
    print("........")
    celltype_gsva = gsva(mat, geneSets, mx.diff=TRUE, kcdf=c("Gaussian"), min.sz=5, max.sz = 500)

    # Convert the score matrix to a data frame and store it in the output list
    gsva_scores[[celltype]] = as.data.frame(celltype_gsva, stringsAsFactors = FALSE)

    write.csv(gsva_scores[[celltype]], file = file.path(celltype_dir, 'gsva_scores.csv'))
}


[1] "Estimating Pathway Activity Scores in Excitatory Cells"
[1] "........"
Estimating GSVA scores for 2936 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in Inhibitory Cells"
[1] "........"
Estimating GSVA scores for 2950 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in Astrocyte Cells"
[1] "........"
Estimating GSVA scores for 2152 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in Microglia Cells"
[1] "........"
Estimating GSVA scores for 1781 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in Oligodendrocyte Cells"
[1] "........"
Estimating GSVA scores for 1711 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in OPC Cells"
[1] "........"
Estimating GSVA scores for 2138 gene sets.
Estimating ECDFs with Gaussian kernels

[1] "Estimating Pathway Activity Scores in Endothe

### **Differential pathway activity analysis**

To investigate differential pathway activity across different cell types, we use a multivariate linear model approach, as described in [**Joel W. Blanchard et. al.**](https://doi.org/10.1038/s41586-022-05439-w), using pathway activity scores. 

The linear model has the following form:

`pathway activity ~ β0 × AD + β1 × Covariate1 + β2 × Covariate2 + ...`

The linear models were fitted using the `lmFit()` function and t-statistics were computed using the `eBayes()` function from the Limma R package (v.3.50.3).

To prioritize candidate AD-dysregulated processes, pathways with snRNA-seq evidence of association with AD at a `nominal P value of 0.05` are considered to be potential candidates. This procedure will result in prioritized candidate pathways for all cell types. 

The linear models were fitted using the `lmFit()` function and t-statistics were computed using the `eBayes()` function from the Limma R package (v.3.50.3).

In [19]:
%%R -o all_data -i control_group -i metadata -i subject_id -i bio_covariates -i save_prefix -i technical_covariate -i duplicate_correction -i test_names -i celltypes

# Differential pathway activity analysis.
# Reads the per-cell-type GSVA scores from disk, fits the models through the
# helpers sourced from pathway_analyses.R, and returns everything in `all_data`:
#   gsva_out        - transposed GSVA score matrix per cell type
#   fits_all        - output of get_fits()
#   scores_all      - score tables regrouped as test -> 'all' -> cell type
#   scores_filtered - per test, rows with |score| > 1.3 in at least one column
# `celltypes` is imported explicitly so the cell does not depend on a value
# left behind in the R session by an earlier cell.

library(limma)
library(magrittr)
library(tidyr)
conflicts_prefer(base::unname)

source('../scripts/functions/pathway_analyses.R')

# Subject-level metadata, indexed by subject id, with a binary AD indicator
# (0 for the control group, 1 otherwise).
summary = read.csv(metadata, check.names = FALSE)
#control_group = paste(control_group, 'pathology', sep='-')
summary$AD = ifelse(summary$pathology.group == control_group, 0, 1)
rownames(summary) = summary[, subject_id]

gsva_scores = list()
# iterate over celltypes and read in the pathway scores
for (celltype in celltypes) {

    # file.path() inserts the separators itself; components carry no '/'.
    scores_file = file.path('../results/gsva', save_prefix, celltype, 'gsva_scores.csv')
    pathway_scores = as.matrix(read.csv(scores_file, header=TRUE, row.names=1, check.names=FALSE))

    # store the transposed score matrix in a list
    gsva_scores[[celltype]] <- t(pathway_scores)
}

all_data = list()

all_data[['gsva_out']] = gsva_scores

# get linear model fits
print('getting linear model fits...')
fits = get_fits(gsva_scores, summary, bio_covariates, technical_covariate)
all_data[['fits_all']] = fits

# get matrix of scores for heatmap
print('get matrix of scores')
scores = get_scores(fits)

# Regroup the scores from (cell type -> test) to (test -> 'all' -> cell type)
new_scores <- list()

tests <- test_names
for (test in tests) {

  # Create a new empty list for this test
  new_scores[[test]] <- list('all' = list())

  for (cell_type in celltypes) {

    # Copy the scores for this cell type and test
    new_scores[[test]][['all']][[cell_type]] <- scores[['all']][[cell_type]][[test]]

  }
}

# save new_scores in list
all_data[['scores_all']] = new_scores

print('filter by score 1.3')
all_data[['scores_filtered']] = list()

for(i in names(new_scores)){
    # Union of the row names of all per-cell-type score tables for this test
    names = unique(unname(unlist(lapply(names(new_scores[[i]]$all), function(x) rownames(new_scores[[i]]$all[[x]])))))

    # Get a matrix of scores for all cell types
    mat = get_matrix(new_scores[[i]]$all, names)

    # Keep only rows with an absolute value > 1.3 in at least one column.
    # drop = FALSE keeps the two-dimensional shape even when a single row
    # (or a single column) remains, so the Python side always gets a table.
    mat = mat[unname(rowSums(abs(mat)>1.3)>0), , drop = FALSE]

    # if(top_20==TRUE){
    #     index = unique(unname(unlist(lapply(colnames(mat), function(x) order(abs(mat[[x]]),decreasing = T)[1:20]))))
    #     mat = mat[index,]
    # }
    all_data[['scores_filtered']][[i]] = mat
}


[conflicted] Will prefer base::unname over any other package.
[1] "getting linear model fits..."
[1] "get matrix of scores"
[1] "filter by score 1.3"


In [None]:
# valid_pathways[test][cell_type]: filtered scores of one cell type merged with
#                                  its full score table, sorted by P.Value.
# all_pathways[test]:              the per-cell-type tables stacked and sorted
#                                  by pathway.
valid_pathways = {}
all_pathways = {}

for test_name in test_names:
    valid_pathways[test_name] = {}
    per_celltype_frames = []

    # Loop through the analysed cell types and assemble one table per cell type.
    for cell_type in celltypes:
        try:
            # Filtered scores of this cell type: a one-column table, renamed to 'score_adj'.
            filtered_scores = pd.DataFrame(all_data['scores_filtered'][test_name][cell_type])
            full_scores = all_data['scores_all'][test_name]['all'][cell_type]
            full_scores.index.name = 'pathway'
            filtered_scores.index.name = 'pathway'
            filtered_scores = filtered_scores.rename(columns={cell_type: 'score_adj'})

            # Merge with the full score table on "pathway" and drop rows with
            # missing values (i.e. pathways that did not pass the score filter).
            merged = filtered_scores.merge(full_scores, how='outer', on='pathway').dropna(axis=0)

            # merged = merged[merged['P.Value'] < 0.05]
            merged = merged.sort_values(by='P.Value')
        except KeyError as err:
            # Best effort: a cell type without results for this test is skipped,
            # but reported instead of being dropped silently.
            print(f'skipping {cell_type} for {test_name}: missing key {err}')
            continue

        valid_pathways[test_name][cell_type] = merged
        per_celltype_frames.append(merged)

    # Stack once and sort once instead of growing and re-sorting the table in
    # every iteration. The stable sort keeps rows of the same pathway in
    # `celltypes` order.
    if per_celltype_frames:
        all_pathways[test_name] = pd.concat(per_celltype_frames, axis=0).sort_values(by='pathway', kind='stable')
    else:
        all_pathways[test_name] = pd.DataFrame()

### Renaming convention

In [None]:
# Load the pathway renaming table (all columns as strings) and keep a single
# row per original pathway name.
renaming_convention = pd.read_csv('../data/pathway_databases/renaming_convention.csv', dtype=str)
renaming_convention = renaming_convention.drop_duplicates(subset='names')

for test_name in test_names:
    # Move the 'pathway' index into a column so the tables can be merged,
    # attach the shortened name and highlight flag, then restore the index.
    renamed = all_pathways[test_name].reset_index()
    renamed = pd.merge(renamed, renaming_convention[['names', 'shortened', 'highlight']], how='left', on='names')
    renamed = renamed.set_index(keys='pathway')

    # Missing values are written out as the literal string 'None'.
    all_pathways[test_name] = renamed.astype(object).replace(np.nan, 'None')

    # Output directory for this test; created on first use.
    dat_dir = f'../results/{test_name}/{save_prefix}/Data/'
    if not os.path.exists(dat_dir):
        os.makedirs(dat_dir)

    # Write the annotated table to a CSV file
    all_pathways[test_name].to_csv(dat_dir + 'differentially_expressed_pathways.csv')

### Preliminary Heat Map Visualization

Plot Select Cell-type-specific Pathways

In [None]:
# for test_name in test_names:
#     print(f'heatmap for select cell-type-specific pathways in {test_name} test')
#     list_of_paths_to_annotate = list(all_pathways[test_name].shortened[all_pathways[test_name].highlight=='yes'])
#     df = all_pathways[test_name][all_pathways[test_name]['P.Value']<0.05]

#     fig_dir = f'../results/{test_name}/{save_prefix}/HeatMap/'
#     if not os.path.exists(fig_dir):
#         os.makedirs(fig_dir)
        
#     save_path = fig_dir+'_cell_type_specific_diff_exp_paths.pdf'

#     pathway_analyses.plot_and_select_top_deps(df, list_of_paths_to_annotate, save_prefix=save_prefix,
#                          save_path=save_path, filter=True, cell_type_specific=True, 
#                          test_name=test_name, cell_types=celltypes)

Plot Select Shared Pathways

In [None]:
# for test_name in test_names:
#     print(f'heatmap for select shared pathways in {test_name} test')
#     list_of_paths_to_annotate = list(all_pathways[test_name].shortened[all_pathways[test_name].highlight=='yes'])
#     df = all_pathways[test_name][all_pathways[test_name]['P.Value']<0.05]


#     fig_dir = f'../results/{test_name}/{save_prefix}/HeatMap/'
#     if not os.path.exists(fig_dir):
#         os.makedirs(fig_dir)
        
#     save_path = fig_dir+'filtered_broad_diff_exp_paths.pdf'

#     pathway_analyses.plot_and_select_top_deps(df, list_of_paths_to_annotate, save_prefix=save_prefix,
#                          save_path=save_path, filter=True, cell_type_specific=False, 
#                          test_name=test_name, cell_types=celltypes)
