In [1]:
import os
import scvi
import scgen
import rpy2
import scib
import json
import torch
import anndata
import logging
import warnings
import scanorama
import anndata2ri
import matplotlib
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import scrublet as scr
import doubletdetection
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from os import PathLike, fspath
import rpy2.robjects as robjects
from scipy.sparse import csr_matrix
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)
  from scipy.sparse.base import spmatrix


In [2]:
def get_sys_dpi(width, height, diag):
    '''
    obtain dpi of system
    
    w: width in pixels (if unsure, go vist `whatismyscreenresolution.net`)
    h: height in pixels
    d: diagonal in inches
    '''
    w_inches = (diag**2/ (1 + height**2/width**2))**0.5
    return round(width/w_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()





-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
adjustText                  NA
anndata2ri                  1.1
annoy                       NA
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
chex                        0.1.5
cloudpickle                 2.2.0
colorama                    0.4.4
contextlib2                 NA
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy    

In [4]:
%%R
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(Seurat)
    library(tibble)
    library(magrittr) # needs to be run every time you start R and want to use %>%
})



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

## Table of contents:

  * <a href=#Reading>1. Reading in the data</a>
  * <a href=#Preprocessing>2. Systematic differential analysis of gene expression</a>

# **1. Reading in the data**

### **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
save_prefix = 'mathys_pfc'

adata_annot = sc.read_h5ad(f'../data/processed/{save_prefix}/{save_prefix}_mapped_anndata.h5ad')
adata_annot.X = csr_matrix(adata_annot.layers['counts'].toarray())

In [6]:
adata_annot

AnnData object with n_obs × n_vars = 65619 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'cell_type_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'ms

### **Prepare metadata**

Now we specify other related information

Specify the following:

- `metadata`: Path to metadata. Metadata must contain a column called `pathology.group` with the only unique groups being `no`, `early`, and `late`.

- `map_meta`: whether to map metadata to obtain `pathology.group`. If False, it will be assumed that `pathology.group` exist in `adata.obs`

- `reference_group`: Name of control group in metadata. This should ideally be `no`, representing the control group 

- `test_group`: A list of the name of the group(s) in metadata/ `.obs` that should be treated as the test groups.d 

- `save_prefix`: Prefix for saving critical files. preferably chosen to be in the format `{source name}_{brain region}`. e.g `mathys_pfc`
        

In [3]:
metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv'
reference_group = 'no'                  # name of the control group in metadata 
test_groups = ['late', 'early']         # list of covariates to be accounted for in regression.
filter_genes = True
gene_celltype_threshold = 0.05          # determines number of cells the gene must be expressed in 
test_name = '_n_'.join(test_groups)+'_vs_'+reference_group

## 2.4 Systematic differential analysis of gene expression

[**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2) compared gene expression levels between `AD-pathology and no-pathology individuals in a cell type manner. The differential expression analysis was assessed using two tests. 

- **First**, a cell-level analysis was performed using the Wilcoxon rank-sum test and FDR multiple-testing correction (`FDR-adjusted p-values`). 

- **Second**, a Poisson mixed model accounting for the individual of origin for nuclei and for unwanted sources of variability was performed using the R packages `lme4` and `RUV-seq`, respectively.


Next, we use the ` Wilcoxon rank-sum test` in `scapany.tl.rank_genes_group` comparing `AD-pathology` group to `no-pathology` such that the log foldchange is ;

$$ Log_{2} ({Mean\ Gene\ Expression\ in\ AD\ category\ of\ Cell\ Type\ x \over Mean\ Gene\ Expression\ in\ Normal\ category\ of\ Cell\ Type\ x})$$


##### Group Cells (Pathology / No-Pathology)

In [7]:
if disease_group in 
adata_annot.obs['disease_group'] = None
adata_annot.obs.loc[adata_annot.obs['pathology.group'] == 'no-pathology', 'disease_group'] = 'no-pathology'
adata_annot.obs.loc[adata_annot.obs['pathology.group'].isin(['early-pathology', 'late-pathology']), 'disease_group'] = 'AD-pathology'

##### Load DEGs from Cell-type Differential Expression with Wilcoxon Rank-Sum Test

In [8]:
adata_annot.uns['log1p'] = {'base': None}

adata_sub = {}
degs_t_test = {}
for cell_type in adata_annot.obs.cell_type.unique():

    # adata_sub[cell_type] = adata_annot[adata_annot.obs.cell_type==cell_type]
    # adata_sub[cell_type].X = adata_sub[cell_type].layers['counts'].toarray()
    degs_t_test[cell_type] = pd.read_excel('../results/ad_vs_no/t_test_degs.xlsx', 
                                   sheet_name=cell_type)


Reported DEGs

Next, we load the reported DEGs from [**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2) for comparison in a later step

In [9]:
mathys_degs = {}
for key in adata_annot.obs.cell_type.unique():
    try:
        mathys_degs[key] = pd.read_excel(f'../data/raw/mathys_pfc_from_paper/degs/ad_vs_no/{key.lower()}_degs.xlsx')
        mathys_degs[key].rename({'Unnamed: 0': 'names'}, inplace=True)
        mathys_degs[key]['IndModel.absFC'] = np.abs(mathys_degs[key]['IndModel.FC'])

    except FileNotFoundError:
        continue

The authors assessed the consistency of DEGs detected using the cell-level analysis model (`Wilcoxon rank-sum test`) with those obtained with the Poisson mixed model (`lme4` combined with `RUV-seq`) by comparing the directionality and rank of DEGs in the two models. Considering the technical challenges, of using these packages, we instead benchmark the `rank-sum test` results against `MAST`, a generalized linear model for modelling scRNA-seq data in R, developed by [**Finak, G. et. al. 2015**](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0844-5). In addition to accounting for the the sample (`Subject`) of origin for each nuclei, we include the cellular detection rate, the fraction of genes expressed in a cell as a sources of unwanted variation/variability. Sveral benchmarking studies have shown that MAST is comparitvely better than other methods for scRNA-seq differential expression testing and is a commonly used tool in scRNA-seq analysis. Thus, We perform differential testing using the `MAST package`. 

Since MAST is only available in R, we convert our AnnData object into an R object via `anndata2ri`. MAST requires its own data input format, `SingleCellAssy` instead of the `SingleCellExperiment` object produced by the `anndata2ri` conversion. So, to run MAST we thus first put the data into the SingleCellExperiment format, then convert the SCE object into MAST's expected SingleCellAssay (sca) object. 

#### MAST implementation details

Here, we perform differential testing for AD-pathology vs No-pathology `disease_group`s in each of the cell clusters. Since `MAST` incorporates a zero-inflated negative binomial model tests for differential expression using a hurdle model, it can be very computationally intensive for large data sizes. To remedy this, `we randomly split each data in a stratified manner to inlude <5000 cells across both conditions` while retaining the relative proportion of cells in both test groups (AD-pathology vs No-pathology). This is expected to retain the expression patterns in both tests groups, while speeding up computation

In the generalized linear mixed model (specified and fit with the `zlm()` function), we include the test covariate `disease_group` (AD-pathology vs No-pathology), the `Subject` ID (ROS1 -- ROS48), and the number of genes in the cells, reculated from the log normalized values stored in `adata.layers['log']`. The `Subject` IDs are included as random effects to account for unwanted variability/effects due to individual nuclei orgin, mouse-specific that may confound our results. The number of genes is added to fit the technical variability.

To test for differences over the `disease_group` covariate we perform a likelihood ratio test (in the `summary()` function call after fitting the model). 

In post-processing we correct for multiple testing using a Benjamini-Hochberg FDR correction (function `p.adjust()`) and map the Ensembl Gene IDs to gene symbols which are easier to read and interpret.

### **Run MAST on Excitatory**

In [10]:
# Create new Anndata object for use in MAST with non-batch corrected data as before

adata_exc = adata_annot[adata_annot.obs.cell_type=='Excitatory']
# adata_exc.X = adata_exc.layers['counts'].toarray()
adata_exc.obs['n_genes'] = (adata_exc.X > 0).sum(1) # recompute number of genes expressed per cell

# split data into smaller batches stratified such that the proportion of test groups remains the same
# This is used to avoid kernel crashes over too large data

strata = adata_exc.obs.disease_group

adata_train, adata_test = train_test_split(adata_exc, test_size=0.25, train_size=None, random_state=None,
                                            shuffle=True, stratify=strata)
# adata_test.X = adata_test.X.toarray()
# adata_train.X = adata_train.X.toarray()

adata_exc = adata_test

del adata_train, adata_test

adata_exc

View of AnnData object with n_obs × n_vars = 7487 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors',

In [11]:
%%R -i adata_exc -o exc_degs

library(Seurat)

print("running hurdle model on Excitatory cells...")
cur_seurat_obj <- as.Seurat(adata_exc, counts = "counts", data = 'log', verbose=TRUE)
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group
cur_seurat_obj$amyloid <- as.numeric(cur_seurat_obj$amyloid_x)

exc_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

exc_degs$diff <- exc_degs$pct.1 - exc_degs$pct.2
exc_degs$FDR <- exc_degs$p_val_adj
print("Finished...")
print("--------------------------------------")



[1] "running hurdle model on Excitatory cells..."
[1] "Finished..."
[1] "--------------------------------------"


In [12]:
del adata_exc

### **Run MAST on Inhibitory**

In [13]:
# Create new Anndata object for use in MAST with non-batch corrected data as before

adata_inh = adata_annot[adata_annot.obs.cell_type=='Inhibitory']
#adata_inh.X = adata_sub['Inhibitory'].layers['counts']
adata_inh.obs['n_genes'] = (adata_inh.X > 0).sum(1) # recompute number of genes expressed per cell

# split data into smaller batches stratified such that the proportion of test groups remains the same
# This is used to avoid kernel crashes over too large data

strata = adata_inh.obs.disease_group

adata_train, adata_test = train_test_split(adata_inh, test_size=0.5, train_size=None, random_state=None,
                                            shuffle=True, stratify=strata)
# adata_test.X = adata_test.X.toarray()
# adata_train.X = adata_train.X.toarray()

adata_inh = adata_test

del adata_train, adata_test

adata_inh

View of AnnData object with n_obs × n_vars = 3886 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors',

In [14]:
%%R -i adata_inh -o inh_degs

print("------------------------------------------")
print("running hurdle model on Inhibitory cells...")
cur_seurat_obj <- as.Seurat(adata_inh, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

inh_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

inh_degs$diff <- inh_degs$pct.1 - inh_degs$pct.2
inh_degs$FDR <- inh_degs$p_val_adj


print("Finished...")
print("--------------------------------------")



[1] "------------------------------------------"
[1] "running hurdle model on Inhibitory cells..."
[1] "Finished..."
[1] "--------------------------------------"


In [15]:
del adata_inh

### **Run MAST on Oligodendrocytes**

In [16]:
# Create new Anndata object for use in MAST with non-batch corrected data as before

adata_oli = adata_annot[adata_annot.obs.cell_type=='Oligodendrocyte']
#adata_oli.X = adata_sub['Oligodendrocyte'].layers['counts']
adata_oli.obs['n_genes'] = (adata_oli.X > 0).sum(1) # recompute number of genes expressed per cell

# split data into smaller batches stratified such that the proportion of test groups remains the same

strata = adata_oli.obs.disease_group
adata_train, adata_test = train_test_split(adata_oli, test_size=0.25, train_size=None, random_state=None,
                                            shuffle=True, stratify=strata)

# adata_test.X = adata_test.X.toarray()
# adata_train.X = adata_train.X.toarray()

adata_oli = adata_test

del adata_train, adata_test

adata_oli

View of AnnData object with n_obs × n_vars = 4883 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors',

In [17]:
%%R -i adata_oli -o oli_degs

print("------------------------------------------")
print("running hurdle model on Oligodendrocytes...")
cur_seurat_obj <- as.Seurat(adata_oli, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

oli_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

oli_degs$diff <- oli_degs$pct.1 - oli_degs$pct.2
oli_degs$FDR <- oli_degs$p_val_adj
print("Finished...")
print("--------------------------------------")




[1] "------------------------------------------"
[1] "running hurdle model on Oligodendrocytes..."
[1] "Finished..."
[1] "--------------------------------------"


In [18]:
del adata_oli

### **Run MAST on Astrocytes**

In [19]:
adata_ast = adata_annot[adata_annot.obs.cell_type=='Astrocyte']
#adata_ast.X = adata_sub['Astrocyte'].layers['counts'].toarray()
adata_ast.obs['n_genes'] = (adata_ast.X > 0).sum(1) # recompute number of genes expressed per cell

adata_ast

AnnData object with n_obs × n_vars = 3470 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'msex_x

In [20]:
%%R -i adata_ast -o ast_degs

print("------------------------------------------")
print("running hurdle model on Astrocytes...")
cur_seurat_obj <- as.Seurat(adata_ast, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

ast_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

ast_degs$diff <- ast_degs$pct.1 - ast_degs$pct.2
ast_degs$FDR <- ast_degs$p_val_adj
print("Finished...")
print("--------------------------------------")




[1] "------------------------------------------"
[1] "running hurdle model on Astrocytes..."
[1] "Finished..."
[1] "--------------------------------------"


In [21]:
del adata_ast

### **Run MAST on Endothelial Cells**

In [22]:
adata_end = adata_annot[adata_annot.obs.cell_type=='Endothelial']
#adata_end.X = adata_sub['Endothelial'].layers['counts'].toarray()
adata_end.obs['n_genes'] = (adata_end.X > 0).sum(1) # recompute number of genes expressed per cell

adata_end

AnnData object with n_obs × n_vars = 322 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'msex_x_

In [23]:
%%R -i adata_end -o end_degs 


print("------------------------------------------")
print("running hurdle model on Endothelial cells...")
cur_seurat_obj <- as.Seurat(adata_end, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

end_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

end_degs$diff <- end_degs$pct.1 - end_degs$pct.2
end_degs$FDR <- end_degs$p_val_adj
print("Finished...")
print("--------------------------------------")



[1] "------------------------------------------"
[1] "running hurdle model on Endothelial cells..."
[1] "Finished..."
[1] "--------------------------------------"


In [24]:
del adata_end

### **Run MAST on Microglia Cells**

In [25]:
adata_mic = adata_annot[adata_annot.obs.cell_type=='Microglia']
#adata_mic.X = adata_sub['Microglia'].layers['counts'].toarray()
adata_mic.obs['n_genes'] = (adata_mic.X > 0).sum(1) # recompute number of genes expressed per cell

adata_mic

AnnData object with n_obs × n_vars = 1961 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'msex_x

In [26]:
%%R -i adata_mic -o mic_degs

print("------------------------------------------")
print("running hurdle model on Microglia...")
cur_seurat_obj <- as.Seurat(adata_mic, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

mic_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

mic_degs$diff <- mic_degs$pct.1 - mic_degs$pct.2
mic_degs$FDR <- mic_degs$p_val_adj

print("Finished...")
print("--------------------------------------")



[1] "------------------------------------------"
[1] "running hurdle model on Microglia..."
[1] "Finished..."
[1] "--------------------------------------"


In [27]:
del adata_mic

### **Run MAST on OPCs**

In [28]:
adata_opc = adata_annot[adata_annot.obs.cell_type=='OPC']
#adata_opc.X = adata_sub['OPC'].layers['counts'].toarray()
adata_opc.obs['n_genes'] = (adata_opc.X > 0).sum(1) # recompute number of genes expressed per cell

adata_opc

AnnData object with n_obs × n_vars = 2618 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type', 'disease_group'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'msex_x

In [29]:
%%R  -i adata_opc -o opc_degs

print("------------------------------------------")
print("running hurdle model on OPCs...")
cur_seurat_obj <- as.Seurat(adata_opc, counts = "counts", data = "log")
Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

opc_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
                        min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

opc_degs$diff <- opc_degs$pct.1 - opc_degs$pct.2
opc_degs$FDR <- opc_degs$p_val_adj

print("Finished...")
print("--------------------------------------")

rm(cur_seurat_obj)



[1] "------------------------------------------"
[1] "running hurdle model on OPCs..."
[1] "Finished..."
[1] "--------------------------------------"


In [30]:
del adata_opc

Run Mast

In [31]:
# %%R -i adata_exc -i adata_inh -i adata_oli -i adata_mic -i adata_ast -i adata_opc -i adata_end -o exc_degs 
# #-o inh_degs -o oli_degs -o ast_degs -o mic_degs -o opc_degs -o end_degs 

# library(MAST)
# library(parallel)

# print('parsing data...')

# #Convert SingleCellExperiment to SingleCellAssay type as required by MAST
# exc_sca <- SceToSingleCellAssay(adata_exc, class="SingleCellAssay")
# inh_sca <- SceToSingleCellAssay(adata_inh, class="SingleCellAssay")
# oli_sca <- SceToSingleCellAssay(adata_oli, class="SingleCellAssay")
# ast_sca <- SceToSingleCellAssay(adata_ast, class="SingleCellAssay")
# mic_sca <- SceToSingleCellAssay(adata_mic, class="SingleCellAssay")
# opc_sca <- SceToSingleCellAssay(adata_opc, class="SingleCellAssay")
# end_sca <- SceToSingleCellAssay(adata_end, class="SingleCellAssay")

# print('parsing done...')
# rm(adata_exc, adata_inh, adata_oli, adata_ast, adata_mic, adata_end, adata_opc)

# print("Deleted unused variables...")

# #Scale Gene detection rate
# colData(exc_sca)$n_genes = scale(colData(exc_sca)$n_genes)
# colData(inh_sca)$n_genes = scale(colData(inh_sca)$n_genes)
# colData(oli_sca)$n_genes = scale(colData(oli_sca)$n_genes)
# colData(ast_sca)$n_genes = scale(colData(ast_sca)$n_genes)
# colData(mic_sca)$n_genes = scale(colData(mic_sca)$n_genes)
# colData(opc_sca)$n_genes = scale(colData(opc_sca)$n_genes)
# colData(end_sca)$n_genes = scale(colData(end_sca)$n_genes)

# print('Filtering non-expressed genes...')

# exc_sca_filt = exc_sca[rowSums(assay(exc_sca)) != 0, ]
# inh_sca_filt = inh_sca[rowSums(assay(inh_sca)) != 0, ]
# oli_sca_filt = oli_sca[rowSums(assay(oli_sca)) != 0, ]
# mic_sca_filt = mic_sca[rowSums(assay(mic_sca)) != 0, ]
# ast_sca_filt = ast_sca[rowSums(assay(ast_sca)) != 0, ]
# end_sca_filt = end_sca[rowSums(assay(end_sca)) != 0, ]
# opc_sca_filt = opc_sca[rowSums(assay(opc_sca)) != 0, ]

# rm(exc_sca, oli_sca, mic_sca, ast_sca, end_sca, opc_sca, inh_sca)

# #Define & run hurdle model excitatory cells
# print("-------------------------------------------")
# print("running hurdle model on excitatory cells...")

# cond<-factor(colData(exc_sca_filt)$disease_group)
# cond<-relevel(cond,"no-pathology")
# colData(exc_sca_filt)$disease_group<-cond

# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=exc_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupAD-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupAD-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupAD-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# exc_degs = result
# exc_degs = exc_degs[order(exc_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(exc_sca_filt, zlmCond, summaryCond, summaryDt, result)


# #Define & run hurdle model inhibitory cells
# print("-------------------------------------------")
# print("running hurdle model on inhibitory cells...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=inh_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# inh_degs = result
# inh_degs = inh_degs[order(inh_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(inh_sca_filt, zlmCond, summaryCond, summaryDt, result)



# #Define & run hurdle model oligodendrocytes
# print("------------------------------------------")
# print("running hurdle model on oligodendrocyte...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=oli_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# oli_degs = result
# oli_degs = oli_degs[order(oli_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(oli_sca_filt, zlmCond, summaryCond, summaryDt, result)



# #Define & run hurdle model astrocytes
# print("--------------------------------------")
# print("running hurdle model on astrocytes...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=ast_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# ast_degs = result
# ast_degs = ast_degs[order(ast_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(ast_sca_filt, zlmCond, summaryCond, summaryDt, result)


# #Define & run hurdle model microglia
# print("------------------------------------")
# print("running hurdle model on microglia...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=mic_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# mic_degs = result
# mic_degs = mic_degs[order(mic_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(mic_sca_filt, zlmCond, summaryCond, summaryDt, result)


# #Define & run hurdle model OPC
# print("--------------------------------")
# print("running hurdle model on OPCs...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=opc_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# opc_degs = result
# opc_degs = opc_degs[order(opc_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(opc_sca_filt, zlmCond, summaryCond, summaryDt, result)


# #Define & run hurdle model endothelial
# print("--------------------------------------")
# print("running hurdle model on endothelial...")
# # zlmCond <- zlm(formula = ~disease_group + Subject + n_genes, sca=sca_filt)
# zlmCond <- zlm(formula = ~disease_group + n_genes, sca=end_sca_filt)

# print("finished hurdle model")
# print("Estimating summary")
# summaryCond <- summary(zlmCond, doLRT='disease_groupno-pathology')
# summaryDt <- summaryCond$datatable

# result <- merge(summaryDt[contrast=='disease_groupno-pathology' & component=='H',.(primerid, `Pr(>Chisq)`)], #P-vals
#                     summaryDt[contrast=='disease_groupno-pathology' & component=='logFC', .(primerid, coef)],
#                     by='primerid') #logFC coefficients

# #Correct for multiple testing (FDR correction) and filtering
# result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# end_degs = result
# end_degs = end_degs[order(end_degs$FDR),]
# print("Finished...")
# print("--------------------------------")
# rm(end_sca_filt, zlmCond, summaryCond, summaryDt, result)


In [32]:
# %%R -i adata_exc -i adata_inh -i adata_oli -i adata_mic -i adata_ast -i adata_opc -i adata_end  \
#     -o exc_degs -o inh_degs -o oli_degs -o ast_degs -o mic_degs -o opc_degs -o end_degs 

# library(Seurat)

# print("running hurdle model on Excitatory cells...")
# cur_seurat_obj <- as.Seurat(adata_exc, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group
# cur_seurat_obj$amyloid <- as.numeric(cur_seurat_obj$amyloid_x)

# exc_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# exc_degs$diff <- exc_degs$pct.1 - exc_degs$pct.2
# exc_degs$FDR <- exc_degs$p_val_adj
# print("Finished...")
# print("--------------------------------------")


# print("------------------------------------------")
# print("running hurdle model on Inhibitory cells...")
# cur_seurat_obj <- as.Seurat(adata_inh, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# inh_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# inh_degs$diff <- inh_degs$pct.1 - inh_degs$pct.2
# inh_degs$FDR <- inh_degs$p_val_adj
# print("Finished...")
# print("--------------------------------------")


# print("------------------------------------------")
# print("running hurdle model on Oligodendrocytes...")
# cur_seurat_obj <- as.Seurat(adata_oli, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# oli_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# oli_degs$diff <- oli_degs$pct.1 - oli_degs$pct.2
# oli_degs$FDR <- oli_degs$p_val_adj
# print("Finished...")
# print("--------------------------------------")

# print("------------------------------------------")
# print("running hurdle model on Astrocytes...")
# cur_seurat_obj <- as.Seurat(adata_ast, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# ast_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# ast_degs$diff <- ast_degs$pct.1 - ast_degs$pct.2
# ast_degs$FDR <- ast_degs$p_val_adj
# print("Finished...")
# print("--------------------------------------")


# print("------------------------------------------")
# print("running hurdle model on Microglia...")
# cur_seurat_obj <- as.Seurat(adata_mic, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# mic_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# mic_degs$diff <- mic_degs$pct.1 - mic_degs$pct.2
# mic_degs$FDR <- mic_degs$p_val_adj

# print("Finished...")
# print("--------------------------------------")


# print("------------------------------------------")
# print("running hurdle model on Endothelial cells...")
# cur_seurat_obj <- as.Seurat(adata_end, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# end_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# end_degs$diff <- end_degs$pct.1 - end_degs$pct.2
# end_degs$FDR <- end_degs$p_val_adj
# print("Finished...")
# print("--------------------------------------")



# print("------------------------------------------")
# print("running hurdle model on OPCs...")
# cur_seurat_obj <- as.Seurat(adata_opc, counts = "counts", data = "log")
# Idents(cur_seurat_obj) <- cur_seurat_obj$disease_group

# opc_degs <- FindMarkers(object = cur_seurat_obj, ident.1='AD-pathology', ident.2='no-pathology', only.pos = FALSE, 
#                         min.pct = 0.0, test.use = "MAST", logfc.threshold = 0.00, verbose=TRUE,)

# opc_degs$diff <- opc_degs$pct.1 - opc_degs$pct.2
# opc_degs$FDR <- opc_degs$p_val_adj

# print("Finished...")
# print("--------------------------------------")

# rm(cur_seurat_obj)

#### **Prep MAST results for all cell types**

In [33]:
degs_mast = {}
degs_mast['Excitatory'] = exc_degs.sort_values(by='FDR')
degs_mast['Inhibitory'] = inh_degs.sort_values(by='FDR')
degs_mast['Oligodendrocyte'] = oli_degs.sort_values(by='FDR')
degs_mast['Astrocyte'] = ast_degs.sort_values(by='FDR')
degs_mast['OPC'] = opc_degs.sort_values(by='FDR')
degs_mast['Microglia'] = mic_degs.sort_values(by='FDR')
degs_mast['Endothelial'] = end_degs.sort_values(by='FDR')

for key in degs_mast.keys():
    degs_mast[key].reset_index(inplace=True)
    degs_mast[key].rename(columns={"index": "names"}, inplace=True)

#### Consensus of Differential Expressed Transcripts

The consistency of DEGs detected using the cell-level `Wilcoxon rank-sum` analysis model with those obtained with the `MAST` generalized mixed model and the `scANVI` deep generative model was assessed by comparing the directionality and rank of DEGs in the two models. Consistency in directionality was measured by counting the fraction of the top 1,000 DEGs (ranked by FDR scores) detected in cell-level analysis that showed consistent direction in the deep generative/mixed model.

For analyses involving DEG counts, [**Hansuredi et. al. 2019**](https://www.nature.com/articles/s41586-019-1195-2) used only genes that were significantly supported by both models using the criteria

- `FDR-corrected P < 0.01 in a two sided Wilcoxon-rank sum test`,
- `absolute log2(mean gene expression in AD category x/mean gene expression in AD category y) > 0.25`,
- `FDR-corrected P < 0.05 in the Poisson mixed model` 

Motivated by [**Zhou. et. al. 2020**](https://doi.org/10.1038/s41591-019-0695-9) who used estimated DEGs between conditions using the `MAST algorithm of the Seurat package in R`, we include the following filtering criteria for the 

- `DEGs in the generalized mixed effects model are obatined by filtering genes for log2(fold change) > 0.1, P < 0.05`. 
- `DEGs in the deep generative model are obatined by filtering genes for`  $$\ln(Bayes\ Factor) > 2.3 $$

In [36]:
mast_overlap = {}
mast_consistency  = {}

for key in adata_annot.obs.cell_type.unique():
    mast_overlap[key] = pd.merge(degs_t_test[key], degs_mast[key], how='outer', on=['names'])

    mast_consistency[key] = len(list(set(list(degs_t_test[key]['names'][:1000])) &
                            set(mast_overlap[key][((mast_overlap[key]['avg_log2FC']>0) & (mast_overlap[key]['logfoldchanges']>0))|
                            ((mast_overlap[key]['avg_log2FC']<0) & (mast_overlap[key]['logfoldchanges']<0))].names)))/1000

    mast_overlap[key]['bulk_log2FC'] = np.log2((mast_overlap[key]['pct.1']+0.005)/
                                            (mast_overlap[key]['pct.2']+0.005)) + mast_overlap[key]['avg_log2FC']

    mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] = np.NaN

    mast_overlap[key]['DEGs.Ind.Model'] = False
    mast_overlap[key].loc[(mast_overlap[key]['pvals_adj'] < 0.01) & 
                        (mast_overlap[key]['abs_logfoldchanges'] > 0.25), 'DEGs.Ind.Model'] = True

    pvals = mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'], 'p_val_adj'].tolist()
                        
    try:
        mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'],
                        'p_val_adj_DEGs.Ind.Model'] = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        
        mast_overlap[key]['DEGs.Ind.Mixed.Model'] = False
        mast_overlap[key].loc[(mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] < 0.05), 'DEGs.Ind.Mixed.Model'] = True
        
    except ZeroDivisionError:
        continue

In [37]:
degs_t_test['Excitatory']

Unnamed: 0.1,Unnamed: 0,names,scores,pvals,pvals_adj,logfoldchanges,abs_logfoldchanges,Direction_t_test
0,0,RASGEF1B,30.155712,9.027398e-200,1.428766e-195,1.758242,1.758242,up
1,1,LINGO1,29.114767,2.334517e-186,1.847420e-182,1.433463,1.433463,up
2,2,SLC26A3,24.047064,8.959428e-128,4.726696e-124,1.057443,1.057443,up
3,3,BEX1,-23.102089,4.411363e-118,1.396373e-114,-0.872032,0.872032,down
4,4,SPARCL1,-23.105984,4.031128e-118,1.396373e-114,-1.169944,1.169944,down
...,...,...,...,...,...,...,...,...
15822,15822,ZNF580,-0.100866,9.196568e-01,9.998724e-01,-1.700520,1.700520,down
15823,15823,SSX2,-0.100854,9.196663e-01,9.998724e-01,-1.212336,1.212336,down
15824,15824,RPEL1,-0.100728,9.197667e-01,9.998724e-01,-0.302013,0.302013,down
15825,15825,DMRTC2,-0.109230,9.130199e-01,9.998724e-01,-0.307460,0.307460,down


In [38]:
# Initialize empty dictionaries to store the results
mast_overlap = {}
mast_consistency = {}

# Loop through each cell type
for key in adata_annot.obs.cell_type.unique():
    # Merge DEGs detected by t-test and MAST algorithms based on their gene names
    mast_overlap[key] = pd.merge(degs_t_test[key], degs_mast[key], how='outer', on=['names'])

    # Calculate the consistency in directionality of the top 1000 DEGs between t-test and MAST algorithms
    mast_consistency[key] = len(list(set(list(degs_t_test[key]['names'][:1000])) &
                            set(mast_overlap[key][((mast_overlap[key]['avg_log2FC']>0) & (mast_overlap[key]['logfoldchanges']>0))|
                            ((mast_overlap[key]['avg_log2FC']<0) & (mast_overlap[key]['logfoldchanges']<0))].names)))/1000

    # Calculate the bulk log2FC of each gene by taking the log2 ratio of the percentage of cells expressing the gene in two conditions
    # and adding the average log2FC calculated from MAST and t-test algorithms
    mast_overlap[key]['bulk_log2FC'] = np.log2((mast_overlap[key]['pct.1']+0.005)/
                                            (mast_overlap[key]['pct.2']+0.005)) + mast_overlap[key]['avg_log2FC']

    # Initialize columns for adjusted p-values and DEG identification in the merged table
    mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] = np.NaN
    mast_overlap[key]['DEGs.Ind.Model'] = False
    
    # Mark genes as DEGs if they meet certain criteria based on FDR-corrected p-values and absolute log2 fold change
    mast_overlap[key].loc[(mast_overlap[key]['pvals_adj'] < 0.01) & 
                        (mast_overlap[key]['abs_logfoldchanges'] > 0.25), 'DEGs.Ind.Model'] = True

    # Get the p-values of identified DEGs
    pvals = mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'], 'p_val_adj'].tolist()
                        
    try:
        # Apply FDR correction to the p-values of identified DEGs
        mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'],
                        'p_val_adj_DEGs.Ind.Model'] = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        
        # Identify DEGs in the mixed model based on the FDR-corrected p-values
        mast_overlap[key]['DEGs.Ind.Mixed.Model'] = False
        mast_overlap[key].loc[(mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] < 0.05), 'DEGs.Ind.Mixed.Model'] = True
        
    except ZeroDivisionError:
        # Skip if no DEGs are found
        continue


View some DEGs from mathys data

In [39]:
mathys_degs['Excitatory'][(mathys_degs['Excitatory']['DEGs.Ind.Model'])&
                           (mathys_degs['Excitatory']['DEGs.Ind.Mix.models'])]

Unnamed: 0.1,Unnamed: 0,IndModel.adj.pvals,no.pathology.mean,pathology.mean,IndModel.FC,MixedModel.z,MixedModel.p,DEGs.Ind.Model,DEGs.Ind.Mix.models,IndModel.absFC
0,RASGEF1B,0.000000e+00,3.195138,5.950515,0.897136,6.127535,8.925109e-10,True,True,0.897136
1,NGFRAP1,0.000000e+00,1.837262,1.160450,-0.662872,-11.221532,3.196845e-29,True,True,0.662872
2,LINGO1,6.259506e-298,2.884947,4.381583,0.602907,5.689687,1.272728e-08,True,True,0.602907
3,BEX1,8.203347e-283,1.582531,1.039826,-0.605891,-9.953163,2.442933e-23,True,True,0.605891
4,SLC26A3,2.121809e-276,1.750697,3.262202,0.897916,6.540229,6.142477e-11,True,True,0.897916
...,...,...,...,...,...,...,...,...,...,...
13004,TNFRSF25,7.630976e-03,0.238967,0.312709,0.388009,4.637332,3.529353e-06,True,True,0.388009
13025,RP11-618P17.4,8.239413e-03,0.310741,0.410157,0.400463,4.307534,1.650848e-05,True,True,0.400463
13068,FZD1,8.869463e-03,0.011419,0.020506,0.844692,4.687224,2.769360e-06,True,True,0.844692
13105,PDZD4,9.460903e-03,0.254717,0.398181,0.644528,8.728511,2.580469e-18,True,True,0.644528


View some DEGs from re-analysis 

In [40]:
mast_overlap['Astrocyte'][(mast_overlap['Excitatory']['DEGs.Ind.Model'])&
                           (mast_overlap['Excitatory']['DEGs.Ind.Mixed.Model'])]

Unnamed: 0.1,Unnamed: 0,names,scores,pvals,pvals_adj,logfoldchanges,abs_logfoldchanges,Direction_t_test,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,diff,FDR,bulk_log2FC,p_val_adj_DEGs.Ind.Model,DEGs.Ind.Model,DEGs.Ind.Mixed.Model
0,0.0,GFAP,9.283654,1.637632e-20,2.129904e-16,1.128408,1.128408,up,7.630713e-23,0.316605,0.559,0.411,1.265935e-18,0.148,1.265935e-18,0.755717,,True,False
1,1.0,NRXN1,-8.830657,1.040630e-18,6.767220e-15,-1.276099,1.276099,down,5.718907e-13,-0.142494,0.898,0.942,9.487666e-09,-0.044,9.487666e-09,-0.211132,,True,False
2,2.0,PRKG1,8.440312,3.164897e-17,1.029066e-13,0.988118,0.988118,up,3.192618e-19,0.213602,0.761,0.685,5.296554e-15,0.076,5.296554e-15,0.364350,,True,False
3,3.0,TENM2,-8.466842,2.521362e-17,1.029066e-13,-0.734385,0.734385,down,1.081682e-16,-0.458044,0.498,0.639,1.794511e-12,-0.141,1.794511e-12,-0.814546,,True,False
4,4.0,RASGEF1B,7.965983,1.639156e-15,4.263772e-12,0.476628,0.476628,up,3.981474e-14,0.509415,0.629,0.512,6.605266e-10,0.117,6.605266e-10,0.803734,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6394,6394.0,MMAB,-0.689976,4.902093e-01,9.998465e-01,-0.416674,0.416674,down,2.127330e-01,-0.039396,0.050,0.064,1.000000e+00,-0.014,1.000000e+00,-0.366560,,False,False
6416,6416.0,UBE2J1,-0.713720,4.754006e-01,9.998465e-01,-0.203220,0.203220,down,8.506776e-02,-0.027932,0.068,0.083,1.000000e+00,-0.015,1.000000e+00,-0.297539,,False,False
6430,6430.0,JUP,-0.699657,4.841416e-01,9.998465e-01,-0.592525,0.592525,down,7.678922e-02,-0.038278,0.026,0.040,1.000000e+00,-0.014,1.000000e+00,-0.575935,,False,False
6474,6474.0,TMTC2,-0.661018,5.086006e-01,9.998465e-01,0.053248,0.053248,up,1.571967e-02,-0.079056,0.498,0.528,1.000000e+00,-0.030,1.000000e+00,-0.162633,,False,False


### **Save Data**

In [41]:
with pd.ExcelWriter("../results/ad_vs_no/mixed_model_degs.xlsx") as writer:
    for cell_type in adata_annot.obs.cell_type.unique():
        mast_overlap[cell_type].to_excel(writer, sheet_name=cell_type, na_rep='NA')