In [1]:
import os
import scvi
import scgen
import rpy2
import scib
import json
import torch
import anndata
import logging
import warnings
import scanorama
import anndata2ri
import matplotlib
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import scrublet as scr
import doubletdetection
import decoupler as dc
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from os import PathLike, fspath
import rpy2.robjects as robjects
from scipy.sparse import csr_matrix
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from rpy2.robjects.packages import importr
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)
  from scipy.sparse.base import spmatrix


In [2]:
def get_sys_dpi(width, height, diag):
    """
    Estimate the pixel density (DPI) of a display.

    width: horizontal resolution in pixels (if unsure, visit `whatismyscreenresolution.net`)
    height: vertical resolution in pixels
    diag: screen diagonal in inches

    Returns the horizontal pixels per inch, rounded to the nearest integer.
    """
    # Pythagoras on the physical screen: diag^2 = w_in^2 * (1 + (height / width)^2),
    # so the physical width in inches follows from the diagonal and the aspect ratio.
    squared_aspect = height**2 / width**2
    physical_width = (diag**2 / (1 + squared_aspect)) ** 0.5
    dpi = width / physical_width
    return round(dpi)

In [3]:
# Silence R warning messages that rpy2 forwards to Python.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Hide these Python warning categories for the rest of the notebook.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Activate the pandas and AnnData converters for rpy2, then load the
# extension that provides the `%%R` cell magic used by the R cells below.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

# Render figures at the DPI computed for the given resolution and diagonal.
# NOTE(review): 1512 x 982 px at 14.125 in looks specific to one display -- adjust per machine.
rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

# Raise scanpy's logging verbosity and record the package versions in the output.
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()





-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
adjustText                  NA
anndata2ri                  1.1
annoy                       NA
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
chex                        0.1.5
cloudpickle                 2.2.0
colorama                    0.4.4
contextlib2                 NA
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy    

In [4]:
%%R
# Attach every R package used by the differential-expression cells.
# Startup messages are suppressed to keep the notebook output readable.
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(Seurat)
    library(tibble)
    # needs to be run every time you start R and want to use %>%
    library(magrittr)
    library(forcats)
    library(Matrix)
    library(stats)
    library(tester)
    library(methods)
    library(matrixStats)
    library(edgeR)
    library(DESeq2)
    library(limma)
    library(pbmcapply)
    library(parallel)
    library(lmerTest)
    library(lme4)
    library(glmmTMB)
    library(blme)
})



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

## Table of contents:

  * <a href=#Reading>1. Reading in the data</a>
  * <a href=#Preprocessing>2. Systematic differential analysis of gene expression</a>

# **1. Reading in the data**

## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
save_prefix = 'leng_etc'

# Read the `*_mapped_anndata.h5ad` file stored under the processed-data folder for this prefix.
adata_annot = sc.read_h5ad(
    f'../data/processed/{save_prefix}/{save_prefix}_mapped_anndata.h5ad'
)

# Make cell and gene identifiers unique.
adata_annot.obs_names_make_unique()
adata_annot.var_names_make_unique()

# Use the `counts` layer as the expression matrix for everything downstream.
adata_annot.X = adata_annot.layers['counts'].copy()

# Drop the auxiliary containers so only X, obs and var are carried forward.
del adata_annot.obsm
del adata_annot.layers
del adata_annot.varm
del adata_annot.uns
del adata_annot.obsp


In [6]:
adata_annot

AnnData object with n_obs × n_vars = 41432 × 16585
    obs: 'SampleID', 'PatientID', 'BrainRegion', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'cell_type', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'predictions'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'

## **Prepare metadata**

Now we specify other related information

Specify the following:

- `metadata`: Path to metadata. Metadata must contain a column called `pathology.group` with the only unique groups being `no`, `early`, and `late`.

- `map_meta`: Whether to map the metadata onto the cells to obtain `pathology.group`. If `False`, it is assumed that `pathology.group` already exists in `adata.obs`.

- `test_names`: List of the different test names of interest.

- `save_prefix`: Prefix for saving critical files. preferably chosen to be in the format `{source name}_{brain region}`. e.g `mathys_pfc`

- `subject_id`: Name of the column containing Subject/Patient ID in metadata and `.obs`

- `covariates`: A list of additional confounding covariates (not including `pathology.group`), e.g. `Sex`, `Sample Batch`, `Age`, and other factors that are not of direct interest but might have an effect on the pathological status. If you do not want to include any confounders, set `covariates = ['None']`.

- `filter_genes`: Specifies whether to filter genes using `gene_celltype_threshold` before performing differential expression tests.

        

In [7]:
# User-tunable settings for the differential-expression run.
map_meta = True                         # whether to derive `pathology.group` from the metadata file
# NOTE(review): stored as the string "TRUE"; no cell in this part of the notebook reads it -- confirm it is still needed.
filter_genes = "TRUE"
subject_id = 'PatientID'                # for leng this is `PatientID` for mathys is 'Subject', and 'individualID' for allen
gene_celltype_threshold = 0.05          # passed as `min_prop` to dc.filter_by_prop and used as the detection-rate cutoff in the R cells
covariates = ['None']                   # list of covariates to be accounted for in regression; ['None'] means no latent variables
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv' 


# Switches selecting which method families are run. The values are the strings
# "TRUE"/"FALSE" (not Python booleans): they are read by the R cells and by the
# Python cells that save the results.
deg_methods_to_run = robjects.ListVector({
                                        'single_cell_methods': "FALSE",
                                        'pseudo_bulk_methods': "TRUE",
                                        'mixed_model_methods': "FALSE",
                                        })



## **Systematic differential analysis of gene expression**

[**Hansruedi Mathys et al.**](https://doi.org/10.1038/s41586-019-1195-2) compared gene expression levels between `AD-pathology` and `no-pathology` individuals in a cell-type-specific manner. Differential expression was assessed using two tests.

- **First**, a cell-level analysis was performed using the Wilcoxon rank-sum test and FDR multiple-testing correction (`FDR-adjusted p-values`). 

- **Second**, a Poisson mixed model accounting for the individual of origin for nuclei and for unwanted sources of variability was performed using the R packages `lme4` and `RUV-seq`, respectively.


However, extensive evidence suggests that approaches that take a cell-level view where cells are modeled/considered individually are often susceptible to pseudoreplication where inferential statistics is applied to biological replicates which are not statistically independent. As a result, failing to account for the inherent correlation of replicates (cells from the same individual) inflates the false discovery rate (FDR). [**Junttila et al. 2022**](https://doi.org/10.1093/bib/bbac286), [**Squair et al. 2021**](https://www.nature.com/articles/s41467-021-25960-2), [**Zimmerman et. al 2021**](https://www.nature.com/articles/s41467-021-21038-1)


Indeed, these studies suggest that, batch effect correction or the aggregation of cell-type-specific expression values within an individual through either a sum, mean or random effect per individual, that is `pseudobulk generation`, should be applied prior to DGE analysis to account for within-sample correlations [**Zimmerman et. al 2021**](https://www.nature.com/articles/s41467-021-21038-1). Generally, both, pseudobulk methods with sum aggregation such as edgeR, DESeq2, or Limma [**Ritchie et al., 2015**](https://academic.oup.com/nar/article/43/7/e47/2414268) and mixed models such as MAST with random effect setting were found to be superior compared to naive methods, such as the popular Wilcoxon rank-sum test or Seurat’s latent models, which do not account for them [**Junttila et al. 2022**](https://doi.org/10.1093/bib/bbac286).


## **Methods**

Here we use a modified version of the R package `Libra`, provided by [**Squair et al. 2021**](https://www.nature.com/articles/s41467-021-25960-2), which implements all of the DE analysis methods discussed in that study within a consistent interface.
The source package is available from [GitHub](https://github.com/neurorestore/Libra) and as Supplementary Software. 

The source package implements a total of `22 unique differential expression methods` that can all be accessed from a single function. These methods encompass traditional single-cell methods as well as methods accounting for biological replicate including pseudobulk and mixed model methods. The code for this package has been largely inspired by the Seurat and Muscat packages. Please see the documentation of these packages for further information.

### **Usage**

The main function of `run_de` takes as input a preprocessed features-by-cells (e.g., genes-by-cells for scRNA-seq) matrix, and a data frame containing metadata associated with each cell, minimally including the cell type annotations, replicates, and sample labels to be predicted.

The methods implemented in this package are summarized as follows:

__Single cell methods__

- Wilcoxon Rank-Sum test
- Likelihood ratio test
- Student's t-test
- Negative binomial linear model
- Logistic regression
- MAST

__Pseudobulk methods__
- edgeR-LRT
- edgeR-QLF
- DESeq2-LRT
- DESeq2-Wald
- limma-trend
- limma-voom

__Mixed model methods__
- Linear mixed model
- Linear mixed model-LRT
- Negative binomial generalized linear mixed model
- Negative binomial generalized linear mixed model-LRT
- Negative binomial generalized linear mixed model with offset
- Negative binomial generalized linear mixed model with offset-LRT
- Poisson generalized linear mixed model
- Poisson generalized linear mixed model-LRT
- Poisson generalized linear mixed model with offset
- Poisson generalized linear mixed model with offset-LRT

By default the pipeline will use a pseudobulk approach, implementing the `edgeR` package with a likelihood ratio test (LRT) null hypothesis testing framework. Each of the 22 tests can be accessed through three key variables of the `run_de` function: `de_family`, `de_method`, and `de_type`. Their precise access arguments are summarized in the below table.

| Method | de_family | de_method | de_type |
|--------|-----------|-----------|---------|
Wilcoxon Rank-Sum test | singlecell | wilcox | |
Likelihood ratio test | singlecell | bimod | |
Student's t-test | singlecell | t | |
Negative binomial linear model | singlecell | negbinom | |
Logistic regression | singlecell | LR | |
MAST | singlecell | MAST | |
edgeR-LRT | pseudobulk | edgeR | LRT
edgeR-QLF | pseudobulk | edgeR | QLF
DESeq2-LRT | pseudobulk | DESeq2 | LRT
DESeq2-Wald | pseudobulk | DESeq2 | Wald
limma-trend | pseudobulk | limma | trend
limma-voom | pseudobulk | limma | voom
Linear mixed model | mixedmodel | linear | Wald
Linear mixed model-LRT | mixedmodel | linear | LRT
Negative binomial generalized linear mixed model | mixedmodel | negbinom | Wald
Negative binomial generalized linear mixed model-LRT | mixedmodel | negbinom | LRT
Negative binomial generalized linear mixed model with offset | mixedmodel | negbinom_offset | Wald
Negative binomial generalized linear mixed model with offset-LRT | mixedmodel | negbinom_offset | LRT
Poisson generalized linear mixed model | mixedmodel | poisson | Wald
Poisson generalized linear mixed model-LRT | mixedmodel | poisson | LRT
Poisson generalized linear mixed model with offset | mixedmodel | poisson_offset | Wald
Poisson generalized linear mixed model with offset-LRT | mixedmodel | poisson_offset | LRT

In [8]:
# Methods to run within each family. For the pseudobulk and mixed-model
# families each entry has the form `<de_method>-<de_type>`; the R cells split
# it on "-" and pass the two parts to `run_de` as `de_method` and `de_type`.
single_cell_methods = ['wilcox', 'MAST']
pseudo_bulk_methods = ['edgeR-QLF', 'DESeq2-Wald', 'limma-voom']
mixed_model_methods = ['poisson-LRT', 'negbinom-LRT']

## **Consensus of Differential Expressed Transcripts**

Considering these models have different specificity, we apply all of the methods and retain the genes that are supported consistently across them.

As in [**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2), the consistency of DEGs detected using the  `single-cell`, `mixed-model`, and the `pseudobulk` approaches will be assessed by comparing the directionality and rank of DEGs across all the methods. Considering these models have different specificity, we apply all of the methods. In [**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2) consistency in directionality was measured by counting the fraction of the top 1,000 DEGs (ranked by FDR scores) detected in cell-level analysis that showed consistent direction in the deep generative/mixed model.

For analyses involving DEG counts, we used only genes that were significantly supported by all methods using the criteria

- `FDR-corrected P < 0.05 in a pseudobulk test`,
- `FDR-corrected P < 0.05 in a single-cell test`,
- `FDR-corrected P < 0.05 in the Poisson mixed model` 
- `absolute log2(mean gene expression in AD category x/mean gene expression in AD category y) > 0.25`,


when comparing the `AD-pathology` group to the `no-pathology` group, where the log fold change is defined as:

$$ Log_{2} ({Mean\ Gene\ Expression\ in\ AD\ category\ of\ Cell\ Type\ x \over Mean\ Gene\ Expression\ in\ Normal\ category\ of\ Cell\ Type\ x})$$

## **Map Metadata**

In [9]:
# Attach `pathology.group` to every cell, honouring the `map_meta` switch.
if map_meta:
    # Read everything as strings so subject IDs compare equal no matter how
    # pandas would otherwise have parsed them (e.g. numeric IDs).
    meta = pd.read_csv(metadata).astype(str)
    mapping = dict(zip(meta[subject_id], meta['pathology.group']))

    # Cast the IDs in `.obs` to str as well: the mapping keys are strings, so
    # non-string IDs would otherwise silently map to NaN.
    adata_annot.obs['pathology.group'] = adata_annot.obs[subject_id].astype(str).map(mapping)

    # Surface cells whose subject is missing from the metadata instead of
    # letting NaN labels flow unnoticed into the DE tests.
    n_unmapped = int(adata_annot.obs['pathology.group'].isna().sum())
    if n_unmapped:
        print(f'{n_unmapped} cells have no `pathology.group` entry in {metadata}')
elif 'pathology.group' not in adata_annot.obs:
    # With map_meta disabled the column has to be present already.
    raise KeyError("`map_meta` is False but `pathology.group` is missing from `adata_annot.obs`")

## **Loading data into memory**

In [10]:
# Aggregate the expression matrix by summing over the cells of each
# (subject, cell type) combination; no samples are dropped at this stage.
pdata = dc.get_pseudobulk(
    adata_annot,
    sample_col=subject_id,
    groups_col='cell_type',
    layer=None,
    mode='sum',
    min_cells=0,
    min_counts=0,
)

celltypes = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Microglia', 'Oligodendrocyte', 'OPC']

# For every cell type, let decoupler select the genes to retain using
# `gene_celltype_threshold` as `min_prop`.
genes_to_keep = {}
for cell_type in celltypes:
    print(f'filtering genes in {cell_type}...')
    genes_to_keep[cell_type] = dc.filter_by_prop(
        pdata[pdata.obs['cell_type'] == cell_type].copy(),
        min_prop=gene_celltype_threshold,
    )

# Wrap the per-cell-type selections in an R list so they can be passed to the R cells.
genes_to_keep_list = robjects.ListVector(genes_to_keep)

# The pseudobulk object is only needed for the gene selection above.
del pdata

filtering genes in Excitatory...
filtering genes in Inhibitory...
filtering genes in Astrocyte...
filtering genes in Microglia...
filtering genes in Oligodendrocyte...
filtering genes in OPC...


In [11]:
%%R -i adata_annot -i genes_to_keep_list -i deg_methods_to_run -i celltypes

# The `-i` flags copy the listed Python objects into the R session once, so the
# later R cells can reuse them. Printing confirms what `adata_annot` became in R.
print(adata_annot)

print('loaded data into memory for recursive use')



class: SingleCellExperiment 
dim: 16585 41432 
metadata(0):
assays(1): X
rownames(16585): SAMD11 NOC2L ... S100B PRMT2
rowData names(12): mt n_cells_by_counts ... mean std
colnames(41432): EC2_AAACCTGAGGATGCGT EC2_AAACCTGAGTCAATAG ...
  EC10_TTTGTCATCTATCGCC EC10_TTTGTCATCTCTGCTG
colData names(24): SampleID PatientID ... predictions pathology.group
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
[1] "loaded data into memory for recursive use"


#### **Pseudo-bulk Differential Expression Analysis with custom scripts adapted from [Squair et al. 2021](https://www.nature.com/articles/s41467-021-25960-2)**

In [12]:
%%R -i covariates -i test_names -i subject_id -i pseudo_bulk_methods -i gene_celltype_threshold -o pseudobulk_degs

# Pseudobulk differential expression for every cell type, contrast and method.
# Result layout: pseudobulk_degs[[celltype]][[test_name]][[de_method]].

library(scuttle)

source('../scripts/functions/deg_functions/run_de.R')
source('../scripts/functions/deg_functions/pseudobulk_de.R')
source('../scripts/functions/deg_functions/check_inputs.R')
source('../scripts/functions/deg_functions/to_pseudobulk.R')
source('../scripts/functions/deg_functions/mixedmodel_de.R')
source('../scripts/functions/deg_functions/singlecell_de.R')

# ['None'] on the Python side means "no latent variables".
if ('None' %in% covariates){
    latent_vars = NULL
} else {
    latent_vars = covariates
}

if (eval(deg_methods_to_run[['pseudo_bulk_methods']])){
    pseudobulk_degs <- list()

    # NOTE(review): the reference level is fixed to 'no' for every contrast,
    # including `late_vs_early`; presumably `run_de` derives the actual
    # contrast from `test_name` -- confirm against run_de.R.
    ref_level = 'no'

    for (celltype in celltypes){

        pseudobulk_degs[[celltype]] <- list()

        # Keep only the cells of the current cell type.
        print('--------------------------------')
        print(paste0('Estimating DEGs in ', celltype, '...'))
        sce_cell <- adata_annot[, adata_annot$cell_type == celltype]

        # Keep only the genes selected for this cell type on the Python side
        # (dc.filter_by_prop with min_prop = gene_celltype_threshold).
        sce_cell <- sce_cell[genes_to_keep_list[[celltype]], ]

        # Cell-level metadata with `pathology.group` as a factor whose first
        # level is the reference. This does not depend on the contrast, so it
        # is prepared once per cell type.
        meta = colData(sce_cell)
        meta$pathology.group <- factor(meta$pathology.group)
        meta$pathology.group <- relevel(meta$pathology.group, ref = ref_level)

        for (test_name in test_names){
            sce <- sce_cell

            pseudobulk_degs[[celltype]][[test_name]] <- list()

            for(de_method in pseudo_bulk_methods){
                print('----------------------------------')
                print(paste0('Obtaining statistics for ', toupper(test_name), ' with pseudo-bulk method ', toupper(de_method)))

                # Entries look like "edgeR-QLF": part 1 is the method, part 2 the test type.
                method_parts <- strsplit(de_method, "-")[[1]]

                pseudobulk_degs[[celltype]][[test_name]][[de_method]] = run_de(sce,
                                                                                meta = meta,
                                                                                replicate_col = subject_id,
                                                                                cell_type_col = 'cell_type',
                                                                                label_col = 'pathology.group',
                                                                                latent_vars = latent_vars,
                                                                                test_name = test_name,
                                                                                ref_level = ref_level,
                                                                                min_cells = 3,
                                                                                min_reps = 2,
                                                                                min_features = 0,
                                                                                de_family = 'pseudobulk',
                                                                                de_method = method_parts[1],
                                                                                de_type = method_parts[2],
                                                                                n_threads = 2)
            }
        }

        print('........................')

    }
} else {

    print("Not evaluating DEGs with Pseudobulk Methods.")
    # `-o pseudobulk_degs` needs the variable to exist even when nothing was run.
    pseudobulk_degs = NULL
}

[1] "--------------------------------"
[1] "Estimating DEGs in Excitatory..."
[1] "----------------------------------"
[1] "Obtaining statistics for LATE_VS_EARLY with pseudo-bulk method EDGER-QLF"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for LATE_VS_EARLY with pseudo-bulk method DESEQ2-WALD"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for LATE_VS_EARLY with pseudo-bulk method LIMMA-VOOM"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for EARLY_VS_NO with pseudo-bulk method EDGER-QLF"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for EARLY_VS_NO with pseudo-bulk method DESEQ2-WALD"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for EARLY_VS_NO with pseudo-bulk method LIMMA-VOOM"
[1] "Excitatory"
[1] "----------------------------------"
[1] "Obtaining statistics for LATE_VS_NO with pseudo-bulk m

##### **Save Results**

In [13]:
# Write one workbook per (contrast, method) with one sheet per cell type.
# The run flag is stored as the R-style string "TRUE"/"FALSE"; compare it
# directly instead of eval()-ing text.
run_flags = dict(zip(deg_methods_to_run.names, list(deg_methods_to_run)))

if str(run_flags['pseudo_bulk_methods'][0]).upper() == 'TRUE':
    for test_name in test_names:
        # ExcelWriter does not create missing folders, so make sure the target exists.
        out_dir = f"../results/{test_name}"
        os.makedirs(out_dir, exist_ok=True)

        for deg_method in pseudo_bulk_methods:
            with pd.ExcelWriter(f"{out_dir}/{save_prefix}_{deg_method}_degs.xlsx") as writer:
                for cell_type in celltypes:
                    # Most significant genes first; sort_values returns a new frame,
                    # so the stored results are not modified.
                    df = pseudobulk_degs[cell_type][test_name][deg_method].sort_values(by='p_val_adj')
                    df['abs_logFC'] = df['avg_logFC'].abs()
                    # Anything that is not strictly positive (including NaN) is labelled "down".
                    df['direction'] = np.where(df['avg_logFC'] > 0, 'up', 'down')
                    df.to_excel(writer, sheet_name=cell_type, na_rep='NA')
else:
    print('No result saved for Pseudobulk Methods')

### **Mixed Models**

In [14]:
%%R -i covariates -i test_names -i subject_id -i gene_celltype_threshold -i mixed_model_methods -o mixedmodel_degs

# Mixed-model differential expression for every cell type, contrast and method.
# Result layout: mixedmodel_degs[[celltype]][[test_name]][[de_method]].

source('../scripts/functions/deg_functions/run_de.R')
source('../scripts/functions/deg_functions/pseudobulk_de.R')
source('../scripts/functions/deg_functions/check_inputs.R')
source('../scripts/functions/deg_functions/to_pseudobulk.R')
source('../scripts/functions/deg_functions/mixedmodel_de.R')
source('../scripts/functions/deg_functions/singlecell_de.R')

# ['None'] on the Python side means "no latent variables".
if ('None' %in% covariates){
    latent_vars = NULL
} else {
    latent_vars = covariates
}

if (eval(deg_methods_to_run[['mixed_model_methods']])){
    mixedmodel_degs <- list()
    for (celltype in celltypes){

        mixedmodel_degs[[celltype]] <- list()

        # Keep only the cells of the current cell type.
        print('--------------------------------')
        print(paste0('Estimating DEGs in ', celltype, '...'))
        sce_cell <- adata_annot[, adata_annot$cell_type == celltype]

        # Expose the default assay under the name `counts` so counts() works.
        assay(sce_cell, 'counts') <- assay(sce_cell)

        # Fraction of cells in which each gene is detected.
        detection_rate <- rowMeans(counts(sce_cell) > 0)
        # Keep genes detected in more than `gene_celltype_threshold` of the cells.
        sce_cell <- sce_cell[detection_rate > gene_celltype_threshold, ]

        for (test_name in test_names){
            sce <- sce_cell

            mixedmodel_degs[[celltype]][[test_name]] <- list()

            # "<case>_vs_<reference>": the last element is the reference level.
            contrast_levels <- strsplit(test_name, "_vs_")[[1]]
            ref_level <- tail(contrast_levels, n = 1)

            if (test_name == 'ad_vs_no'){
                # Collapse every non-"no" group into a single "ad" group.
                sce$pathology.group = ifelse(sce$pathology.group == "no", "no", "ad")
            } else if (test_name %in% c('late_vs_no', 'late_vs_early', 'early_vs_no')){
                # Keep only the cells belonging to the two groups being compared.
                sce <- sce[, colData(sce)$pathology.group %in% contrast_levels]
            } else {
                # Previously an unknown name silently reused the last `ref_level`.
                stop(paste0('Unknown test name: ', test_name))
            }

            # Build the metadata from the per-contrast object so that it matches
            # the relabelled / subsetted cells handed to run_de.
            meta = colData(sce)
            meta$pathology.group <- factor(meta$pathology.group)
            meta$pathology.group <- relevel(meta$pathology.group, ref = ref_level)

            for(de_method in mixed_model_methods){
                print('----------------------------------')
                print(paste0('Obtaining statistics for ', toupper(test_name), ' with mixed-model method ', toupper(de_method)))

                # Entries look like "poisson-LRT": part 1 is the method, part 2 the test type.
                method_parts <- strsplit(de_method, "-")[[1]]

                mixedmodel_degs[[celltype]][[test_name]][[de_method]] = run_de(sce,
                                                                                meta = meta,
                                                                                replicate_col = subject_id,
                                                                                cell_type_col = 'cell_type',
                                                                                label_col = 'pathology.group',
                                                                                latent_vars = latent_vars,
                                                                                min_cells = 3,
                                                                                min_reps = 2,
                                                                                min_features = 0,
                                                                                de_family = 'mixedmodel',
                                                                                de_method = method_parts[1],
                                                                                de_type = method_parts[2],
                                                                                n_threads = 2)
            }
        }
        print('........................')
    }
} else {

    print("Not evaluating DEGs with Mixed-model Methods.")
    # `-o mixedmodel_degs` needs the variable to exist even when nothing was run.
    mixedmodel_degs = NULL
}

[1] "Not evaluating DEGs with Mixed-model Methods."


##### **Save Results**

In [15]:
# Write one workbook per (contrast, method) with one sheet per cell type.
# The run flag is stored as the R-style string "TRUE"/"FALSE"; compare it
# directly instead of eval()-ing text.
run_flags = dict(zip(deg_methods_to_run.names, list(deg_methods_to_run)))

if str(run_flags['mixed_model_methods'][0]).upper() == 'TRUE':
    for test_name in test_names:
        # ExcelWriter does not create missing folders, so make sure the target exists.
        out_dir = f"../results/{test_name}"
        os.makedirs(out_dir, exist_ok=True)

        for deg_method in mixed_model_methods:
            with pd.ExcelWriter(f"{out_dir}/{save_prefix}_{deg_method}_degs.xlsx") as writer:
                for cell_type in celltypes:
                    # Most significant genes first; sort_values returns a new frame,
                    # so the stored results are not modified.
                    df = mixedmodel_degs[cell_type][test_name][deg_method].sort_values(by='p_val_adj')
                    df['abs_logFC'] = df['avg_logFC'].abs()
                    # Anything that is not strictly positive (including NaN) is labelled "down".
                    df['direction'] = np.where(df['avg_logFC'] > 0, 'up', 'down')
                    df.to_excel(writer, sheet_name=cell_type, na_rep='NA')
else:
    print('No result saved for Mixed-model Methods')

No result saved for Mixed-model Methods


### **Single-cell Models**

In [16]:
%%R -i covariates -i test_names -i subject_id -i gene_celltype_threshold -i single_cell_methods -o singlecell_degs

# Cell-level differential expression.
# For every cell type, every test in `test_names` and every method in
# `single_cell_methods`, call run_de(..., de_family = 'singlecell') and store the
# result in singlecell_degs[[celltype]][[test_name]][[de_method]].
# Relies on `adata_annot`, `celltypes` and `deg_methods_to_run` already existing
# in the R session.

source('../scripts/functions/deg_functions/run_de.R')
source('../scripts/functions/deg_functions/pseudobulk_de.R')
source('../scripts/functions/deg_functions/check_inputs.R')
source('../scripts/functions/deg_functions/to_pseudobulk.R')
source('../scripts/functions/deg_functions/mixedmodel_de.R')
source('../scripts/functions/deg_functions/singlecell_de.R')

#celltypes <- c('Excitatory', 'Inhibitory', 'Astrocyte', 'Microglia', 'Oligodendrocyte', 'OPC')
#de_methods <- c('wilcox', 'bimod', 't', 'negbinom', 'LR', 'MAST')

# A literal 'None' among the covariates means "no latent variables".
if ('None' %in% covariates){
    latent_vars = NULL
} else {
    latent_vars = covariates
}

if (eval(deg_methods_to_run[['single_cell_methods']])){
    singlecell_degs <- list()
    for (celltype in celltypes){

        singlecell_degs[[celltype]] <- list()

        # Keep only the cells of the current cell type
        print('--------------------------------')
        print(paste0('Estimating DEGs in ', celltype, '...'))
        sce_cell <- adata_annot[, adata_annot$cell_type == celltype]

        # Fraction of cells (within this cell type) in which each gene has a non-zero count
        detection_rate <- rowMeans(counts(sce_cell) > 0)
        # Keep only genes whose detection rate exceeds `gene_celltype_threshold`
        sce_cell <- sce_cell[detection_rate > gene_celltype_threshold, ]

        for (test_name in test_names){
            sce <- sce_cell

            singlecell_degs[[celltype]][[test_name]] <- list()

            if (test_name == 'ad_vs_no'){
                # Collapse every non-"no" group into a single "ad" label
                sce$pathology.group = ifelse(sce$pathology.group == "no", "no", "ad")
            } else if (test_name %in% c('late_vs_no', 'late_vs_early', 'early_vs_no')){
                # Keep only the cells belonging to the two groups named in the test
                sce <- sce[, colData(sce)$pathology.group %in% c(strsplit(test_name, "_vs_")[[1]])]
            } else {
                # Fail loudly instead of continuing with an undefined or stale `ref_level`
                stop(paste0("Unrecognised test name: ", test_name))
            }
            # The reference level is the group after "_vs_"
            ref_level <- tail(strsplit(test_name, "_vs_")[[1]], n = 1)

            # Build the metadata from the contrast-specific object so that it has the
            # same cells and the same (recoded) labels as `sce`.
            # NOTE(review): whether run_de() reads `meta` or colData(sce) for an
            # SCE input is not visible from this cell - confirm in run_de.R.
            meta <- colData(sce)
            meta$pathology.group <- relevel(factor(meta$pathology.group), ref = ref_level)

            for (de_method in single_cell_methods){
                print('----------------------------------')
                print(paste0('Obtaining statistics for ', toupper(test_name), ' with single-cell method ', toupper(de_method)))
                singlecell_degs[[celltype]][[test_name]][[de_method]] = run_de(sce,
                                                                                meta = meta,
                                                                                replicate_col = subject_id,
                                                                                cell_type_col = 'cell_type',
                                                                                label_col = 'pathology.group',
                                                                                latent_vars = latent_vars,
                                                                                min_cells = 3,
                                                                                min_reps = 2,
                                                                                min_features = 0,
                                                                                de_family = 'singlecell',
                                                                                de_method = de_method,
                                                                                de_type = NULL,
                                                                                n_threads = 2)
            }
        }

        print('........................')
    }

} else {
    # `else` must sit on the same line as the closing brace to parse at top level

    print("Not evaluating DEGs with Single-cell Methods.")
    singlecell_degs = NULL
}

[1] "Not evaluating DEGs with Single-cell Methods."


##### **Save Results**

In [17]:
# Export the single-cell DEG tables: one Excel workbook per (test, method),
# with one sheet per cell type.
run_flags = dict(zip(deg_methods_to_run.names, list(deg_methods_to_run)))
# The flag is handled as text (e.g. 'TRUE'/'true' -> 'True'); compare it
# explicitly instead of passing the string to eval().
run_single_cell = str(run_flags['single_cell_methods'][0]).capitalize() == 'True'

if run_single_cell:
    for test_name in test_names:
        # ExcelWriter does not create missing parent directories.
        out_dir = f"../results/{test_name}"
        os.makedirs(out_dir, exist_ok=True)
        for deg_method in single_cell_methods:
            with pd.ExcelWriter(f"{out_dir}/{save_prefix}_{deg_method}_degs.xlsx") as writer:
                for cell_type in celltypes:
                    # sort_values returns a new frame, so the columns added
                    # below do not modify the table stored in singlecell_degs.
                    df = singlecell_degs[cell_type][test_name][deg_method].sort_values(by='p_val_adj')
                    df['abs_logFC'] = df['avg_logFC'].abs()
                    # Anything that is not strictly positive (including 0 and NaN) is labelled "down".
                    df['direction'] = df['avg_logFC'].apply(lambda x: "up" if x > 0 else "down")
                    df.to_excel(writer, sheet_name=cell_type, na_rep='NA')
else:
    # This cell handles the single-cell family, so the message names it.
    print('No result saved for Single-cell Methods')

No result saved for Mixed-model Methods


## **Consensus of Differential Expressed Transcripts**

The consistency of DEGs detected using the cell-level `Wilcoxon rank-sum` analysis model with those obtained with the `MAST` generalized mixed model and the `scANVI` deep generative model was assessed by comparing the directionality and rank of DEGs in the two models. Consistency in directionality was measured by counting the fraction of the top 1,000 DEGs (ranked by FDR scores) detected in cell-level analysis that showed consistent direction in the deep generative/mixed model.

For analyses involving DEG counts, [**Mathys et al. 2019**](https://www.nature.com/articles/s41586-019-1195-2) used only genes that were significantly supported by both models, using the criteria:

- `FDR-corrected P < 0.01 in a two sided Wilcoxon-rank sum test`,
- `absolute log2(mean gene expression in AD category x/mean gene expression in AD category y) > 0.25`,
- `FDR-corrected P < 0.05 in the Poisson mixed model` 

Motivated by [**Zhou et al. 2020**](https://doi.org/10.1038/s41591-019-0695-9), who estimated DEGs between conditions using the `MAST algorithm of the Seurat package in R`, we include the following filtering criteria for the generalized mixed effects model and the deep generative model:

- `DEGs in the generalized mixed effects model are obtained by filtering genes for log2(fold change) > 0.1, P < 0.05`. 
- `DEGs in the deep generative model are obtained by filtering genes for`  $$\ln(Bayes\ Factor) > 2.3 $$

### **Save Data**