In [1]:
import os
import scvi
import scgen
import rpy2
import scib
import json
import torch
import anndata
import logging
import warnings
import scanorama
import anndata2ri
import matplotlib
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import scrublet as scr
import doubletdetection
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from os import PathLike, fspath
import rpy2.robjects as robjects
from scipy.sparse import csr_matrix
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)
  from scipy.sparse.base import spmatrix


In [2]:
def get_sys_dpi(width, height, diag):
    '''
    obtain dpi of system
    
    w: width in pixels (if unsure, go vist `whatismyscreenresolution.net`)
    h: height in pixels
    d: diagonal in inches
    '''
    w_inches = (diag**2/ (1 + height**2/width**2))**0.5
    return round(width/w_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()





-----
anndata     0.8.0
scanpy      1.9.1
-----
OpenSSL                     22.0.0
PIL                         9.2.0
absl                        NA
adjustText                  NA
anndata2ri                  1.1
annoy                       NA
appnope                     0.1.2
asttokens                   NA
astunparse                  1.6.3
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
boto3                       1.26.32
botocore                    1.29.32
bottleneck                  1.3.5
brotli                      NA
certifi                     2022.09.24
cffi                        1.15.1
chex                        0.1.5
cloudpickle                 2.2.0
colorama                    0.4.4
contextlib2                 NA
cryptography                38.0.1
cycler                      0.10.0
cython_runtime              NA
dask                        2022.11.0
dateutil                    2.8.2
debugpy    

In [4]:
%%R
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(Seurat)
    library(tibble)
    library(magrittr) # needs to be run every time you start R and want to use %>%
})



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

## Table of contents:

  * <a href=#Reading>1. Reading in the data</a>
  * <a href=#Preprocessing>2. Systematic differential analysis of gene expression</a>

# 1. Reading in the data

### [Mathys et. al. 2019](https://doi.org/10.1038/s41586-019-1195-2) (Prefrontal Cortex)

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
adata_annot = sc.read_h5ad('../data/processed/adata_annotated.h5ad')
adata_annot.X = csr_matrix(adata_annot.layers['counts'].toarray())

In [6]:
adata_annot

AnnData object with n_obs × n_vars = 65619 × 16590
    obs: 'projid', 'fastq', 'Subject', 'sample', 'libraryid', 'study', 'age_death', 'educ', 'msex_x', 'gpath_x', 'amyloid_x', 'plaq_n_x', 'cogdx_x', 'pathologic diagnosis of AD', 'amyloid_y', 'plaq_n_y', 'nft', 'tangles', 'cogn_global_lv', 'gpath_y', 'gpath_3neocort', 'amyloid.group', 'caa_4gp', 'ceradsc', 'braaksc', 'niareagansc', 'cogdx_y', 'msex_y', 'pathology.group', 'sampleid', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'doublet_score', 'predicted_doublet', 'louvain_0.5', 'louvain_1.0', 'cell_type'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Subject_colors', 'amyloid.group_colors', 'braaksc_colors', 'cell_type_colors', 'dendrogram_louvain_0.5', 'hvg', 'log1p', 'louvain', 'louvain_0.5_colors', 'louvain_1.0_colors', 'ms

## 2.4 Systematic differential analysis of gene expression

[**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2) compared gene expression levels between `AD-pathology and no-pathology individuals in a cell type manner. The differential expression analysis was assessed using two tests. 

- **First**, a cell-level analysis was performed using the Wilcoxon rank-sum test and FDR multiple-testing correction (`FDR-adjusted p-values`). 

- **Second**, a Poisson mixed model accounting for the individual of origin for nuclei and for unwanted sources of variability was performed using the R packages `lme4` and `RUV-seq`, respectively.


Next, we use the ` Wilcoxon rank-sum test` in `scapany.tl.rank_genes_group` comparing `AD-pathology` group to `no-pathology` such that the log foldchange is ;

$$ Log_{2} ({Mean\ Gene\ Expression\ in\ AD\ category\ of\ Cell\ Type\ x \over Mean\ Gene\ Expression\ in\ Normal\ category\ of\ Cell\ Type\ x})$$


##### Group Cells (Pathology / No-Pathology)

In [7]:
adata_annot.obs['disease_group'] = None
adata_annot.obs.loc[adata_annot.obs['pathology.group'] == 'no-pathology', 'disease_group'] = 'no-pathology'
adata_annot.obs.loc[adata_annot.obs['pathology.group'].isin(['early-pathology', 'late-pathology']), 'disease_group'] = 'AD-pathology'

##### Load DEGs from Cell-type Differential Expression with Wilcoxon Rank-Sum Test

In [8]:
adata_annot.uns['log1p'] = {'base': None}

adata_sub = {}
degs_t_test = {}
for cell_type in adata_annot.obs.cell_type.unique():

    # adata_sub[cell_type] = adata_annot[adata_annot.obs.cell_type==cell_type]
    # adata_sub[cell_type].X = adata_sub[cell_type].layers['counts'].toarray()
    degs_t_test[cell_type] = pd.read_excel('../results/ad_vs_no/t_test_degs.xlsx', 
                                   sheet_name=cell_type)


Reported DEGs

Next, we load the reported DEGs from [**Hansruedi Mathys et. al.**](https://doi.org/10.1038/s41586-019-1195-2) for comparison in a later step

In [9]:
mathys_degs = {}
for key in adata_annot.obs.cell_type.unique():
    try:
        mathys_degs[key] = pd.read_excel(f'../data/raw/mathys_pfc_from_paper/degs/ad_vs_no/{key.lower()}_degs.xlsx')
        mathys_degs[key].rename({'Unnamed: 0': 'names'}, inplace=True)
        mathys_degs[key]['IndModel.absFC'] = np.abs(mathys_degs[key]['IndModel.FC'])

    except FileNotFoundError:
        continue

### Deep Generative Models for Detecting Differential Expression

Although we provide `MAST` implementation for all cell clusters using randomly sampled cells with less than 25000 cells in the AD-pathology and no-pathology groups, this parametric based approach scales very poorly to larger clusters (e.g `excitatory` with >20k cells). Thus, we instead employ a deep learning based approaches described by [**Pierre Boyeau et. al. 2022**](https://www.biorxiv.org/content/10.1101/794289v1) & [**Lopez et. al. 2018**](https://www.nature.com/articles/s41592-018-0229-2) and implemented in [**scvi-tools**](https://scvi-tools.org/). 

[**scvi-tools**](https://scvi-tools.org/) provides a suite of probabilistic single-cell variational inference tools for single-cell omics data.A semi-supervised model called `scANVI` (single-cell ANnotation using Variational Inference) uses cell type knowledge for a portion of the cells contained in the data sets to infer the states of the other cells. As a result, in addition to Differential Gene Expression analysis, scANVI can assist in annotating a data set of unlabelled cells from manually annotated atlases.

In the particular case of single-cell RNA-seq data, existing differential expression models often model that the mean expression level 
. as a linear function of the cell-state and batch assignments. These models face two notable limitations to detect differences in expression between cell-states in large-scale scRNA-seq datasets. First, such linear assumptions may not capture complex batch effects existing in such datasets accurately. When comparing two given states  and  in a large dataset, these models may also struggle to leverage data of a related state present in the data.

Existing differential expression models frequently represent the mean expression level in the specific situation of single-cell RNA-seq data as a linear function of the cell-state and batch assignments. To identify changes in expression between cell states in large-scale scRNA-seq datasets, these models have two significant drawbacks. First off, such linear hypotheses might not accurately represent complex batch effects present in such datasets. These models could have trouble utilizing data from a related state that is included in the data when comparing two given states and in a big dataset.

These issues might not apply to deep generative models. To capture batch effects on expression, the majority of scvi-tools models employ intricate nonlinear mappings. They can use a lot of data and amortization to better identify shared relationships between features. Deep generative models therefore have desirable characteristics for differential expression in big data.

In addition, these models can

- approximate population-specific normalized expression levels

- detect biologically relevant features

- provide easy-to-interpret predictions


Please see [**this page**](https://docs.scvi-tools.org/en/stable/user_guide/background/differential_expression.html) for further information 


### Explanation of `change` mode and Bayes Factors

Explanation adapted from the scVi [`differential_expression_score` docstring](https://github.com/YosefLab/scVI/blob/05920d1f85daa362d4fb694e588ab090bc84e207/scvi/inference/posterior.py#L640).

`scANVI` provides several ways to formulate the competing hypotheses from the effect-sizes equivalent to `log2 Fold-Change` to detect Differentially Expressed features. To avoid detecting features of little practical interest, e.g., when expression differences between conditions are significant but very subtle, we set the `mode` hyperparameter to `change`. In this formulation, we consider null hypotheses instead, such that

$$|\beta|\ \leq\ \delta\$$

Where, delta ($\delta$) is an hyperparameter that is equivalent to the `log2 Fold-Change` threshold, but can also be estimated by the model in a data-driven fashion. The **`change`** mode follows the protocol described in [Boyeau et al, bioRxiv 2019. doi: 10.1101/794289](https://doi.org/10.1101/794289)

Performing differential expression using the `change` mode consists in estimating an effect size random variable (e.g., log fold-change) and 
performing Bayesian hypothesis testing on this variable. 

The `change function` computes the effect size variable `r` based two inputs corresponding to the normalized means in both populations 

#### Hypotheses:

$M_1: r \in R_0$ (effect size r in region inducing differential expression)


$M_2: r \notin R_0$  (no differential expression)

To characterize the region $R_0$, the user has two choices. 

##### Option 1) Specify an interval
    
A common case is when the region $[-\delta, \delta]$ does not induce differential expression. If the user specifies a threshold delta, we suppose that $R_0 = \mathbb{R} \backslash [-\delta, \delta]$
    
##### Option 2) Specify an indicator function

Specify an specific indicator function $ \mathbb{1} : \mathbb{R} \mapsto \{0, 1\}  \text{  s.t.  }  r \in R_0 \iff \mathbb{1}(r) = 1$. Decision-making can then be based on the estimates of $p(M_1 | x_1, x_2)$

Differential Expression between conditions 1 and 2 for each gene can then be based on the Bayes factors:

$$
\text{Natural Log Bayes Factor for gene g in conditions 1 and 2} = \ln ( {BF^g_{12}) = \ln(\frac{ p(M^g_1 | x_1, x_2)}{p(M^g_2 | x_1, x_2)}})
$$

**The scvi `differential_expression_score` returns the _natural logarithm_ of the Bayes Factor. This is $\ln(BF_{10})$ in the table below.**

Bayes factors vs. p-values has been expertly reviewed by Leonhard Held and Manuela Ott in [On p-Values and Bayes Factors](https://doi.org/10.1146/annurev-statistics-031017-100307), as well as a shorter overview in [this blog post](https://www.nicebread.de/what-does-a-bayes-factor-feel-like/) by Felix Schönbrodt. 

A common interpretation table is copied below.  
In our notation, $BF_{10}$ is $BF^g_{12}$, $H_0$ is $M^g_1$ and $H_1$ is $M^g_2$
    
To compute the gene specific Bayes factors using masks idx1 and idx2 we sample the Posterior in the following way:

1. The posterior ($q(z_A | x_A)$ and $q(z_B | x_B)$) is sampled `n_samples` times for each subpopulation
    
2. For computation efficiency (posterior sampling is quite expensive), instead of
    comparing element-wise the obtained samples, we can permute posterior samples.
    

### **Interpreting Bayes factors**

A common interpretation table is copied below.  

In our notation, $BF_{10}$ is $BF^g_{12}$, $H_0$ is $M^g_1$ and $H_1$ is $M^g_2$
    
    
| Bayes factor $BF_{10}$ |  $\ln(BF_{10})$        | Interpretation              |
|------------------------|------------------------|-----------------------------|  
| > 100                  | > 4.60                 | Extreme evidence for H1     | 
| 30 – 100               | (3.4, 4.6)             | Very strong evidence for H1 | 
| 10 – 30                | (2.3, 3.4)             | Strong evidence for H1      | 
| 3 – 10                 | (1.1, 2.3)             | Moderate evidence for H1    | 
| 1 – 3                  | (0 , 1.1)              | Anecdotal evidence for H1   | 
| 1                      | 0                      | No evidence                 | 
| 1/3 – 1                | (-1.1, 0)              | Anecdotal evidence for H0   | 
| 1/3 – 1/10             | (-2.30, -1.1)          | Moderate evidence for H0    | 
| 1/10 – 1/30            | (-3.4, -2.30)          | Strong evidence for H0      | 
| 1/30 – 1/100           | (-4.6, -3.4)           | Very strong evidence for H0 | 
| < 1/100                | < -4.6                 | Extreme evidence for H0     | 

Please note that the `bayes_factor` value returned by scVI is $\ln(BF_{10})$. The difference in interpreting $\ln(BF_{10})$, $\ln(BF_{01})$, $BF_{10}$ and $BF_{01})$ is nicely explained in the blog post [A short taxonomy of Bayes factors](https://www.nicebread.de/a-short-taxonomy-of-bayes-factors/) by Felix Schönbrodt.



#### Interpretation of differential expression results

- `bayes_factor`: The bayes factor for condition 1 having a higher expression than condition 2

- `proba_de`: p-value for condition **i** having higher expression than the other.

- `raw_mean`**`i`**: the average number of UMI counts sampled condition **i** during the DE. Because scvi is a generative model, it is generating new cells and counting how many times each gene is sampled to compute the DE.

- `raw_normalized_mean`**`i`**: the average library size normalized expression for that group (divide by library size, multiply by 10,000).

- `non_zeros_proportion`**`i`**: proportion of non-zero expression in the group/condition **i**

- `scale_`**`i`**: average scVI inferred gene expression scale in group/condition **i**. This is the same value that can be obtained using 
`model.get_normalized_expression()` using `library_size=1`, equivalent to the "expression frequency" of the gene: what fraction of the UMIs sampled from that cell would belong to that gene. 

##### Set-up model and train for All Clusters

In [34]:
def train_scanvi(adata: dict,
                layer: str='counts',
                labels_key: str='cell_type',
                unlabeled_category: str='unknown',
                batch_key: str='Subject',
                categorical_covariate_keys: list=['Subject'],
                train: bool=True,
                train_kwargs: dict =  dict(use_gpu=False, batch_size=64, early_stopping=True, 
                                            max_epochs=100, early_stopping_min_delta=100,
                                            early_stopping_patience=15,
                                            early_stopping_mode='min')                 
                ):

    degs_scanvi = {}

    for key in adata_sub.keys():

        adata_temp = adata[key].copy()

        scvi.model.SCANVI.setup_anndata(adata_temp, layer=layer, labels_key=labels_key,
                                        unlabeled_category=unlabeled_category, batch_key=batch_key,
                                        categorical_covariate_keys=categorical_covariate_keys,
                                        )

        logger = TensorBoardLogger(save_dir=f'../models/scanvi_{key}_degs/')

        train_kwargs['logger'] = logger

        if train:
            model = scvi.model.SCANVI(adata_temp, gene_likelihood='zinb')

            print(f'Training model for {key}...')

            model.train(**train_kwargs)
            model.save('../models/', prefix=f'scanvi_{key.lower()}_', overwrite=True)
        else:
            model = scvi.model.SCANVI.load('../models/', adata=adata_temp, prefix=f'scanvi_{key.lower()}_',)
        
        adata[key].obsm["X_scANVI"] = model.get_latent_representation()

        print(f'Estimating DEGs for {key} cluster...')

        degs_scanvi[key] = model.differential_expression(adata=adata_temp, groupby='disease_group',
                                                         group1='AD-pathology', group2='no-pathology', mode='change',
                                                         delta=None, batch_size=None, all_stats=True, batch_correction=True,
                                                         fdr_target=0.05, silent=False, **dict(n_samples=5000))
                                        
        degs_scanvi[key].reset_index(inplace=True)
        degs_scanvi[key].rename(columns={'index': 'names'}, inplace=True)
        degs_scanvi[key]['odds_ratio'] = np.exp(degs_scanvi[key]['bayes_factor'])
        
        print("\n")

    return degs_scanvi


In [35]:
# degs_scanvi = train_scanvi(adata=adata_sub, layer='counts', labels_key='disease_group', batch_key='Subject',
#                             unlabeled_category='unknown', categorical_covariate_keys=['Subject',], train=True,
#                             train_kwargs=dict(use_gpu=False, batch_size=64, early_stopping=True, max_epochs=100, 
#                                             early_stopping_min_delta=100, early_stopping_patience=15, early_stopping_mode='min') 
#                             )

#### Consensus of Differential Expressed Transcripts

The consistency of DEGs detected using the cell-level `Wilcoxon rank-sum` analysis model with those obtained with the `MAST` generalized mixed model and the `scANVI` deep generative model was assessed by comparing the directionality and rank of DEGs in the two models. Consistency in directionality was measured by counting the fraction of the top 1,000 DEGs (ranked by FDR scores) detected in cell-level analysis that showed consistent direction in the deep generative/mixed model.

For analyses involving DEG counts, [**Hansuredi et. al. 2019**](https://www.nature.com/articles/s41586-019-1195-2) used only genes that were significantly supported by both models using the criteria

- `FDR-corrected P < 0.01 in a two sided Wilcoxon-rank sum test`,
- `absolute log2(mean gene expression in AD category x/mean gene expression in AD category y) > 0.25`,
- `FDR-corrected P < 0.05 in the Poisson mixed model` 

Motivated by [**Zhou. et. al. 2020**](https://doi.org/10.1038/s41591-019-0695-9) who used estimated DEGs between conditions using the `MAST algorithm of the Seurat package in R`, we include the following filtering criteria for the 

- `DEGs in the generalized mixed effects model are obatined by filtering genes for log2(fold change) > 0.1, P < 0.05`. 
- `DEGs in the deep generative model are obatined by filtering genes for`  $$\ln(Bayes\ Factor) > 2.3 $$

In [36]:
mast_overlap = {}
mast_consistency  = {}

for key in adata_annot.obs.cell_type.unique():
    mast_overlap[key] = pd.merge(degs_t_test[key], degs_mast[key], how='outer', on=['names'])

    mast_consistency[key] = len(list(set(list(degs_t_test[key]['names'][:1000])) &
                            set(mast_overlap[key][((mast_overlap[key]['avg_log2FC']>0) & (mast_overlap[key]['logfoldchanges']>0))|
                            ((mast_overlap[key]['avg_log2FC']<0) & (mast_overlap[key]['logfoldchanges']<0))].names)))/1000

    mast_overlap[key]['bulk_log2FC'] = np.log2((mast_overlap[key]['pct.1']+0.005)/
                                            (mast_overlap[key]['pct.2']+0.005)) + mast_overlap[key]['avg_log2FC']

    mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] = np.NaN

    mast_overlap[key]['DEGs.Ind.Model'] = False
    mast_overlap[key].loc[(mast_overlap[key]['pvals_adj'] < 0.01) & 
                        (mast_overlap[key]['abs_logfoldchanges'] > 0.25), 'DEGs.Ind.Model'] = True

    pvals = mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'], 'p_val_adj'].tolist()
                        
    try:
        mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'],
                        'p_val_adj_DEGs.Ind.Model'] = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        
        mast_overlap[key]['DEGs.Ind.Mixed.Model'] = False
        mast_overlap[key].loc[(mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] < 0.05), 'DEGs.Ind.Mixed.Model'] = True
        
    except ZeroDivisionError:
        continue

In [37]:
degs_t_test['Excitatory']

Unnamed: 0.1,Unnamed: 0,names,scores,pvals,pvals_adj,logfoldchanges,abs_logfoldchanges,Direction_t_test
0,0,RASGEF1B,30.155712,9.027398e-200,1.428766e-195,1.758242,1.758242,up
1,1,LINGO1,29.114767,2.334517e-186,1.847420e-182,1.433463,1.433463,up
2,2,SLC26A3,24.047064,8.959428e-128,4.726696e-124,1.057443,1.057443,up
3,3,BEX1,-23.102089,4.411363e-118,1.396373e-114,-0.872032,0.872032,down
4,4,SPARCL1,-23.105984,4.031128e-118,1.396373e-114,-1.169944,1.169944,down
...,...,...,...,...,...,...,...,...
15822,15822,ZNF580,-0.100866,9.196568e-01,9.998724e-01,-1.700520,1.700520,down
15823,15823,SSX2,-0.100854,9.196663e-01,9.998724e-01,-1.212336,1.212336,down
15824,15824,RPEL1,-0.100728,9.197667e-01,9.998724e-01,-0.302013,0.302013,down
15825,15825,DMRTC2,-0.109230,9.130199e-01,9.998724e-01,-0.307460,0.307460,down


In [38]:
# Initialize empty dictionaries to store the results
mast_overlap = {}
mast_consistency = {}

# Loop through each cell type
for key in adata_annot.obs.cell_type.unique():
    # Merge DEGs detected by t-test and MAST algorithms based on their gene names
    mast_overlap[key] = pd.merge(degs_t_test[key], degs_mast[key], how='outer', on=['names'])

    # Calculate the consistency in directionality of the top 1000 DEGs between t-test and MAST algorithms
    mast_consistency[key] = len(list(set(list(degs_t_test[key]['names'][:1000])) &
                            set(mast_overlap[key][((mast_overlap[key]['avg_log2FC']>0) & (mast_overlap[key]['logfoldchanges']>0))|
                            ((mast_overlap[key]['avg_log2FC']<0) & (mast_overlap[key]['logfoldchanges']<0))].names)))/1000

    # Calculate the bulk log2FC of each gene by taking the log2 ratio of the percentage of cells expressing the gene in two conditions
    # and adding the average log2FC calculated from MAST and t-test algorithms
    mast_overlap[key]['bulk_log2FC'] = np.log2((mast_overlap[key]['pct.1']+0.005)/
                                            (mast_overlap[key]['pct.2']+0.005)) + mast_overlap[key]['avg_log2FC']

    # Initialize columns for adjusted p-values and DEG identification in the merged table
    mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] = np.NaN
    mast_overlap[key]['DEGs.Ind.Model'] = False
    
    # Mark genes as DEGs if they meet certain criteria based on FDR-corrected p-values and absolute log2 fold change
    mast_overlap[key].loc[(mast_overlap[key]['pvals_adj'] < 0.01) & 
                        (mast_overlap[key]['abs_logfoldchanges'] > 0.25), 'DEGs.Ind.Model'] = True

    # Get the p-values of identified DEGs
    pvals = mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'], 'p_val_adj'].tolist()
                        
    try:
        # Apply FDR correction to the p-values of identified DEGs
        mast_overlap[key].loc[mast_overlap[key]['DEGs.Ind.Model'],
                        'p_val_adj_DEGs.Ind.Model'] = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        
        # Identify DEGs in the mixed model based on the FDR-corrected p-values
        mast_overlap[key]['DEGs.Ind.Mixed.Model'] = False
        mast_overlap[key].loc[(mast_overlap[key]['p_val_adj_DEGs.Ind.Model'] < 0.05), 'DEGs.Ind.Mixed.Model'] = True
        
    except ZeroDivisionError:
        # Skip if no DEGs are found
        continue


View some DEGs from mathys data

In [39]:
mathys_degs['Excitatory'][(mathys_degs['Excitatory']['DEGs.Ind.Model'])&
                           (mathys_degs['Excitatory']['DEGs.Ind.Mix.models'])]

Unnamed: 0.1,Unnamed: 0,IndModel.adj.pvals,no.pathology.mean,pathology.mean,IndModel.FC,MixedModel.z,MixedModel.p,DEGs.Ind.Model,DEGs.Ind.Mix.models,IndModel.absFC
0,RASGEF1B,0.000000e+00,3.195138,5.950515,0.897136,6.127535,8.925109e-10,True,True,0.897136
1,NGFRAP1,0.000000e+00,1.837262,1.160450,-0.662872,-11.221532,3.196845e-29,True,True,0.662872
2,LINGO1,6.259506e-298,2.884947,4.381583,0.602907,5.689687,1.272728e-08,True,True,0.602907
3,BEX1,8.203347e-283,1.582531,1.039826,-0.605891,-9.953163,2.442933e-23,True,True,0.605891
4,SLC26A3,2.121809e-276,1.750697,3.262202,0.897916,6.540229,6.142477e-11,True,True,0.897916
...,...,...,...,...,...,...,...,...,...,...
13004,TNFRSF25,7.630976e-03,0.238967,0.312709,0.388009,4.637332,3.529353e-06,True,True,0.388009
13025,RP11-618P17.4,8.239413e-03,0.310741,0.410157,0.400463,4.307534,1.650848e-05,True,True,0.400463
13068,FZD1,8.869463e-03,0.011419,0.020506,0.844692,4.687224,2.769360e-06,True,True,0.844692
13105,PDZD4,9.460903e-03,0.254717,0.398181,0.644528,8.728511,2.580469e-18,True,True,0.644528


View some DEGs from re-analysis 

In [40]:
mast_overlap['Astrocyte'][(mast_overlap['Excitatory']['DEGs.Ind.Model'])&
                           (mast_overlap['Excitatory']['DEGs.Ind.Mixed.Model'])]

Unnamed: 0.1,Unnamed: 0,names,scores,pvals,pvals_adj,logfoldchanges,abs_logfoldchanges,Direction_t_test,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,diff,FDR,bulk_log2FC,p_val_adj_DEGs.Ind.Model,DEGs.Ind.Model,DEGs.Ind.Mixed.Model
0,0.0,GFAP,9.283654,1.637632e-20,2.129904e-16,1.128408,1.128408,up,7.630713e-23,0.316605,0.559,0.411,1.265935e-18,0.148,1.265935e-18,0.755717,,True,False
1,1.0,NRXN1,-8.830657,1.040630e-18,6.767220e-15,-1.276099,1.276099,down,5.718907e-13,-0.142494,0.898,0.942,9.487666e-09,-0.044,9.487666e-09,-0.211132,,True,False
2,2.0,PRKG1,8.440312,3.164897e-17,1.029066e-13,0.988118,0.988118,up,3.192618e-19,0.213602,0.761,0.685,5.296554e-15,0.076,5.296554e-15,0.364350,,True,False
3,3.0,TENM2,-8.466842,2.521362e-17,1.029066e-13,-0.734385,0.734385,down,1.081682e-16,-0.458044,0.498,0.639,1.794511e-12,-0.141,1.794511e-12,-0.814546,,True,False
4,4.0,RASGEF1B,7.965983,1.639156e-15,4.263772e-12,0.476628,0.476628,up,3.981474e-14,0.509415,0.629,0.512,6.605266e-10,0.117,6.605266e-10,0.803734,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6394,6394.0,MMAB,-0.689976,4.902093e-01,9.998465e-01,-0.416674,0.416674,down,2.127330e-01,-0.039396,0.050,0.064,1.000000e+00,-0.014,1.000000e+00,-0.366560,,False,False
6416,6416.0,UBE2J1,-0.713720,4.754006e-01,9.998465e-01,-0.203220,0.203220,down,8.506776e-02,-0.027932,0.068,0.083,1.000000e+00,-0.015,1.000000e+00,-0.297539,,False,False
6430,6430.0,JUP,-0.699657,4.841416e-01,9.998465e-01,-0.592525,0.592525,down,7.678922e-02,-0.038278,0.026,0.040,1.000000e+00,-0.014,1.000000e+00,-0.575935,,False,False
6474,6474.0,TMTC2,-0.661018,5.086006e-01,9.998465e-01,0.053248,0.053248,up,1.571967e-02,-0.079056,0.498,0.528,1.000000e+00,-0.030,1.000000e+00,-0.162633,,False,False


### **Save Data**

In [41]:
with pd.ExcelWriter("../results/ad_vs_no/mixed_model_degs.xlsx") as writer:
    for cell_type in adata_annot.obs.cell_type.unique():
        mast_overlap[cell_type].to_excel(writer, sheet_name=cell_type, na_rep='NA')