In [1]:
import os
import scvi
import rpy2
import scib
import json
import torch
import anndata
import logging
import warnings
import scanorama
import anndata2ri
import matplotlib
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import scrublet as scr
import doubletdetection
import decoupler as dc
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from os import PathLike, fspath
import rpy2.robjects as robjects
from scipy.sparse import csr_matrix
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from rpy2.robjects.packages import importr
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  from scipy.sparse.base import spmatrix
  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  __import__("pkg_resources").declare_namespace(__name__)


In [2]:
def get_sys_dpi(width, height, diag):
    '''
    Estimate the pixel density (dots per inch) of a display.

    width:  horizontal resolution in pixels (if unsure, visit `whatismyscreenresolution.net`)
    height: vertical resolution in pixels
    diag:   screen diagonal in inches

    Returns the number of horizontal pixels per inch, rounded to the nearest integer.
    '''
    # Pythagoras with a fixed aspect ratio:
    #   diag**2 = w_in**2 + h_in**2   and   h_in / w_in = height / width
    aspect_ratio_sq = height**2 / width**2
    width_inches = (diag**2 / (1 + aspect_ratio_sq))**0.5
    return round(width / width_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()





-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
absl                        NA
aiobotocore                 2.5.3
aiohttp                     3.8.4
aioitertools                0.11.0
aiosignal                   1.3.1
anndata2ri                  1.1
annoy                       NA
anyio                       NA
appnope                     0.1.3
asttokens                   NA
async_timeout               4.0.2
attr                        23.1.0
backcall                    0.2.0
botocore                    1.31.17
bs4                         4.12.2
certifi                     2023.05.07
cffi                        1.15.1
charset_normalizer          3.1.0
chex                        0.1.7
click                       8.1.3
comm                        0.1.3
contextlib2                 NA
croniter                    NA
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator  

  mod_version = _find_version(mod.__version__)


In [4]:
%%R
# Attach every R package used by the R cells of this notebook, with the
# package start-up messages silenced. Each package is attached exactly once
# (attaching an already attached package again is a no-op).
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(Seurat)
    library(tibble)
    library(magrittr)  # needs to be run every time you start R and want to use %>%
    library(forcats)
    library(Matrix)
    library(stats)
    library(tester)
    library(methods)
    library(matrixStats)
    library(edgeR)
    library(DESeq2)
    library(limma)
    library(pbmcapply)
    library(parallel)
    library(lmerTest)
    library(lme4)
    library(glmmTMB)
    library(blme)
})



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

  displaypub.publish_display_data(


1: package ‘DESeq2’ was built under R version 4.3.1 
2: package ‘S4Vectors’ was built under R version 4.3.1 
3: package ‘IRanges’ was built under R version 4.3.1 
4: package ‘GenomeInfoDb’ was built under R version 4.3.1 
5: package ‘MatrixGenerics’ was built under R version 4.3.1 
6: In checkMatrixPackageVersion() :
  Package version inconsistency detected.
TMB was built with Matrix version 1.6.0
Current Matrix version is 1.6.1.1
Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
7: In checkDepPackageVersion(dep_pkg = "TMB") :
  Package version inconsistency detected.
glmmTMB was built with TMB version 1.9.3
Current TMB version is 1.9.6
Please re-install glmmTMB from source or restore original ‘TMB’ package (see '?reinstalling' for more information)


# **1. Reading in the data**

## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

Please set `get_cell_types=True` if the `cell_type` column is absent, or if it contains cell-type annotations that are not of the form:

- `Excitatory`, `Inhibitory`, `Astrocyte`, `Oligodendrocyte`, `OPC`, `Microglia`, `Endothelial`.

In [5]:
# Study identifier; it selects the input file and the per-study settings used below.
save_prefix = 'gazestani_pfc'
# When True, `cell_type` is (re)derived from the study-specific annotation column.
get_cell_types = False

adata_annot = sc.read_h5ad(f'../data/raw/{save_prefix}/{save_prefix}_raw_anndata.h5ad')
adata_annot.obs_names_make_unique()
adata_annot.var_names_make_unique()

# Keep `.X` and `layers['counts']` in sync: an existing counts layer takes precedence,
# otherwise `.X` is snapshotted as the counts layer.
if 'counts' in adata_annot.layers.keys():
    adata_annot.X = adata_annot.layers['counts'].copy()
else:
    adata_annot.layers['counts'] = adata_annot.X.copy()

# Discard every layer other than 'counts'.
layer_keys = list(adata_annot.layers.keys())

for layer in layer_keys:
    if layer == 'counts':
        continue
    del adata_annot.layers[layer]

# Store both matrices as float32.
adata_annot.X = adata_annot.X.astype(np.float32)
adata_annot.layers['counts'] = adata_annot.layers['counts'].astype(np.float32)

# Remove embeddings, loadings, unstructured annotations and pairwise graphs.
del adata_annot.obsm
del adata_annot.varm
del adata_annot.uns
del adata_annot.obsp


In [6]:
# Display the summary of the loaded object (dimensions, `.obs` columns, layers).
adata_annot

AnnData object with n_obs × n_vars = 892828 × 38199
    obs: 'nUMI', 'nGene', 'dataset', 'clusters', 'QC_Gene_total_count_x', 'QC_Gene_unique_count_x', 'QC_MT.pct_x', 'QC_IEG.pct_x', 'QC_top50_pct_x', 'status', 'anno_batch_x', 'sample', 'anno_organism', 'ds_batch', 'anno_batch_y', 'anno_orig_cellState', 'anno_age', 'anno_sex', 'anno_First_author', 'anno_pmid', 'anno_region', 'anno_ctype', 'anno_class', 'anno_braak_score', 'anno_condition', 'anno_RNAseq_profiling_method', 'anno_RNAseq_platform', 'QC_MT.pct_y', 'QC_IEG.pct_y', 'QC_Gene_total_count_y', 'QC_Gene_unique_count_y', 'QC_top50_pct_y', 'UMAP_1', 'UMAP_2', 'cell_barcode', 'anno_ctype2', 'QC_lncRNA_pct', 'percent.mito', 'dataset2', 'QC_OXPHOS.pct', 'status2', 'pathology.group', 'cell_type', 'individualID'
    layers: 'counts'

In [7]:
# Cell types analysed in this run. Available labels:
# "Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC", "Endothelial"
celltypes = ["OPC"]

# Abbreviated labels shared by the two `leng_*` studies.
_leng_labels = {
    'Exc': 'Excitatory',
    'Inh': 'Inhibitory',
    'Astro': 'Astrocyte',
    'Endo': 'Endothelial',
    'Micro': 'Microglia',
    'OPC': 'OPC',
    'Oligo': 'Oligodendrocyte',
}

# Studies whose labels already use the target names (identity mapping).
_full_name_labels = {
    'Excitatory': 'Excitatory',
    'Inhibitory': 'Inhibitory',
    'Astrocyte': 'Astrocyte',
    'Microglia': 'Microglia',
    'Endothelial': 'Endothelial',
    'OPC': 'OPC',
    'Oligodendrocyte': 'Oligodendrocyte',
}

# Per-study translation from the study's own labels to the harmonised cell-type names.
# Each study gets its own dict object, exactly as if it had been written out in full.
mapping = {
    'leng_etc': dict(_leng_labels),
    'leng_sfg': dict(_leng_labels),
    'allen_mtg': dict(_full_name_labels),
    'seaad_mtg': dict(_full_name_labels),
    "gazestani_pfc": {
        'ExN': 'Excitatory',
        'InN': 'Inhibitory',
        'Astro': 'Astrocyte',
        'MG': 'Microglia',
        'OPC': 'OPC',
        'Oligo': 'Oligodendrocyte',
        'Endo': 'Endothelial',
    },
}

# Per-study name of the `.obs` column that holds the study's own cell-type labels.
cell_column = {
    'leng_etc': 'clusterCellType',
    'leng_sfg': 'clusterCellType',
    'allen_mtg': 'cell_labels',
    'seaad_mtg': 'cell_type',
    'gazestani_pfc': 'cell_type',
}

In [8]:
# Optionally harmonise the study-specific labels into the shared `cell_type` names.
if get_cell_types:
    adata_annot.obs['cell_type'] = adata_annot.obs[cell_column[save_prefix]].map(mapping[save_prefix])

# Keep only the requested cell types. Slicing an AnnData returns a *view* of the
# parent object; later cells add and delete `.obs` columns, so take an explicit
# copy here instead of relying on the implicit view-to-copy conversion (and its
# warning) on the first write. Rebinding the name also releases the full dataset.
adata_annot = adata_annot[adata_annot.obs.cell_type.isin(celltypes)].copy()

## **Differential Expression Frameworks**


### **Overview:**

Based on the findings of **[Gazestani et al. 2023](https://www.sciencedirect.com/science/article/pii/S0092867423008590?via%3Dihub)**, this framework employs a pseudocell strategy. This approach combines mixed linear models and jack-knifing techniques to robustly identify differentially expressed genes.

### **Pseudocell Construction and DEG Identification:**

1. **Aggregation:** We first aggregate the raw UMI count of, on average, every 30 cells per subject and cell type. We constructed one pseudocell for cell types that had between 15 to 45 cells in a donor and excluded cell types that had less than 15 cells. This pseudocell-based analysis reduces the impact of dropout and technical variability, while ameliorating low statistical power and high variation in sample size issues attributed to the pseudobulk approaches.

2. **Differential Testing:** We then used the `Limma Trend` approach with robust moderated t-statistic to identify DE genes within each cell class, with specific covariates such as `sex`, `cell type`, `log2(pseudocell MT%)` and `log2(pseudocell nUMI)` as fixed effects and `subject id` as a random effect.


### **Parameters:**

- `metadata`: Path to metadata. It must include a `pathology.group` column with unique groups being `no`, `early`, and `late`.

- `map_meta`: Indicates if metadata mapping is required for `pathology.group`. If set to False, `pathology.group` must exist in `adata.obs`

- `test_names`: List of the different test names of interest.

- `save_prefix`: Preferred prefix for saving critical files. Ideally chosen to be in the format `{source name}_{brain region}`. e.g `mathys_pfc`

- `subject_id`: Column name for Subject/Patient ID in both metadata and `.obs`

- `deg_methods_to_run`: List of methods for the differential expression analysis. Options include: `Trend`, `Voom`, `VoomSampleWeights`, `Dream`. **default: `Trend`**

- `covariates`: List of covariates (including `pathology.group`), e.g. `Sex`, `Sample Batch`, and `Age`. Covariates can be factors that are not of primary interest but might have an effect on the pathological status. For no additional confounders, set `covariates = ['pathology.group']`.

    - `Note`: Ensure that continuous covariates are similarly scaled. For instance, %MT should range between 0-100, and nGene/nUMI values should undergo a log2() transformation.

    - `Standardization`: Maintain uniformity in categorical data. E.g., Avoid having 'M', 'Male', and 'male' in the same dataset.

    - `Impact Assessment`: Regularly assess the impact of the included covariates on the DE results by excluding them one by one and checking how the DE patterns change. If a covariate unduly influences results, exercise caution.

- `filter_genes`: Determines whether genes should be filtered using `gene_celltype_threshold` before DE tests.

- `jack_knifing`: Logical; whether to perform the jack-knifing resampling technique. `This technique is most useful when we are dealing with large datasets with +10 controls and +10 experiment conditions.` Therefore, the parameter is meant to be `False if number of subjects is less than or equal to 10` (in the configuration cell below it is currently hard-coded to `False`).


### **Parameters To Main DE Function:**

The primary differential expression analysis function accepts:


- `inputExpData`: Input (aggregated) SingleCellExperiment data format.

- `covariates`: Column names for covariates, including the main effect. 

- `randomEffect`: Column name for the random effect (in this case `subject_id`). For pseudobulk data, set as NULL.

- `DEmethod`: Method used for the differential expression analysis. Options: `Trend`, `Voom`, `VoomSampleWeights`, `Dream`. **default: `Trend`**

- `normalization`: Normalization method. Options: `CPM`, `TMM`, `VST`, `rmTop50`, and `none`. **default: `CPM`**

- `quantile.norm`: Specifies whether to apply quantile normalization to the normalized data. **Default: F**. Helpful if highly expressed genes, like `MALAT1`, are being identified as differentially expressed.

- `bkg_genes`: Background genes (filtered) used for differential expression testing, `usually set to genes expressed in >1% of cells`. Can be provided to substantially speed up re-runs (e.g. when exploring the effect of different covariates on DE genes). default: NULL

    - `Note`: Scrutinize the count of background genes in the analysis. Overly conservative settings might skip many DE genes, while overly liberal ones could introduce noise. 

In [9]:
map_meta = True         # whether `pathology.group` has to be mapped from the metadata file
filter_genes = "TRUE"   # kept as a string; it is handed to the R DE cell via `-i filter_genes`

# Name of the subject / donor ID column for each study.
subject_ids_for_study = {'allen_mtg': 'individualID',
                        'leng_sfg': 'PatientID',
                        'leng_etc': 'PatientID',
                        'seaad_mtg': 'Donor ID', 
                        'gazestani_pfc': 'individualID'}

# Covariates of the regression model for each study (the main effect `pathology.group` included).
covaraites_for_study = {'allen_mtg': ['pathology.group', 'QC_Gene_total_log', 'QC_MT.pct'], # ['ageDeath.cat', 'sex_y',],
                        'leng_sfg': ['pathology.group', 'QC_Gene_total_log', 'QC_MT.pct'],  # ['ageDeath.cat',],
                        'leng_etc': ['pathology.group', 'QC_Gene_total_log', 'QC_MT.pct'],  # ['ageDeath.cat',]
                        'seaad_mtg': ['pathology.group', 'QC_Gene_total_log', 'QC_MT.pct'],
                        'gazestani_pfc': ['pathology.group', 'Gender', 'APOE', 'QC_Gene_total_log', 'QC_MT.pct']}

subject_id = subject_ids_for_study[save_prefix]    # for leng this is `PatientID` for mathys is 'Subject', and allen is 'individualID'
gene_celltype_threshold = 0.01                     # determines number of cells the gene must be expressed in 
covariates = covaraites_for_study[save_prefix]     # list of covariates to be accounted for in regression.

test_names = ['early_vs_no', 'late_vs_early', 'late_vs_no', 'ad_vs_no']

metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv' 


######################### Differential expression arguments #########################

# The strategy names must match the keys checked by the R cells exactly;
# note that 'smaller network' is spelled with a space there.
pseudobulking_strategies = ['network', 'random', 'bulk'] # options 'network', 'random', 'smaller network', 'bulk'
deg_methods_to_run = ["Trend"] # options "Voom", "VoomSampleWeights", "Dream" default: "Trend"
normalization_methods = ["CPM"] # options: "TMM", "VST", "rmTop50", "none" default: "CPM"
randomEffect = subject_id
# `covariates` is already set above; the second, identical assignment was removed.
jack_knifing = False # False if adata_annot.obs[subject_id].nunique()<=10 else True


## **Differential Gene Expression Analysis with Pseudocell approach [Gazestani. et. al. 2023](https://www.sciencedirect.com/science/article/pii/S0092867423008590?via%3Dihub)**

Most existing approaches use either a cell-based or pseudobulk-based approach to identify differentially expressed genes. The issue with the cell-based approaches is that they take a long time to run and are sensitive to the drop-out issue in single cell data. The pseudobulk methods overcome both these limitations by aggregating the expression of cells from the same cell type and individual into one (pseudo) bulk sample. The issue that arises from this procedure is the loss of statistical power. As an illustration, consider the case where we have sampled 10 cells from a cell type in individual A and 10,000 cells from the same cell type in individual B. In the pseudobulk method, both these individuals would have one pseudobulk sample for this cell type. However, the confidence that we have in the gene expression patterns from individual B is much higher than the confidence that we have in the gene expression patterns from individual A, and pseudobulk methods are blind to this. It doesn't matter whether you sample 10 cells or 10k cells from a cell type: the pseudobulk sample simply ignores this, while in reality they are very different in the level of confidence that they provide.

To address these challenges we use a pseudocell approach. This method is in between the cell-based and pseudobulk-based approaches, so it can take advantage of each method while remedying the issues attributed to them.
The first step for a pseudocell-based differential expression analysis is the generation of the pseudocells from the single cell expression data. Each pseudocell is usually defined as a combination of 10 to 50 cells. Pseudocells are calculated by summing the raw UMI count matrices. There are two main functions in scOnline for the construction of pseudocells:


- `.sconline.PseudobulkGeneration`: `To generate pseudocells that are composed of, on average, 20 cells or higher. This function can also generate pseudobulk samples.`

- `.sconline.Pseudobulk10`: `To generate pseudocells that are on average composed of 10 cells`

We start with generating pseudocells using the function `.sconline.PseudobulkGeneration`.

## **Map Metadata**

In [10]:
# Per-subject metadata. Every column is cast to `str`
# (presumably so that subject IDs compare equal to the `.obs` values — TODO confirm).
meta = pd.read_csv(metadata, encoding_errors='ignore')
meta = meta.astype(str)

# Name of the subject column *inside the metadata file*. `subject_id` may be rewritten
# below (spaces stripped), while the metadata keeps its original header, so metadata
# lookups must not use the rewritten name. The fallback also keeps this cell re-runnable
# after `subject_id` has already been rewritten.
meta_subject_col = subject_id if subject_id in meta.columns else subject_ids_for_study[save_prefix]

# Dedicated lookup names are used below so that `mapping` (the cell-type label
# dictionary consumed by the `get_cell_types` cell) is not overwritten.
if map_meta:
    pathology_by_subject = dict(zip(meta[meta_subject_col], meta['pathology.group']))
    adata_annot.obs['pathology.group'] = adata_annot.obs[subject_id].map(pathology_by_subject)
elif 'pathology.group' not in adata_annot.obs.columns:
    raise KeyError("`map_meta` is False but `pathology.group` is missing from `adata_annot.obs`")

# Strip spaces from the subject column name (e.g. 'Donor ID' -> 'DonorID') and
# propagate the new name to the random-effect setting.
if " " in subject_id:
    subject_id2  = "".join(subject_id.split(" "))
    adata_annot.obs[subject_id2] = adata_annot.obs[subject_id].copy()
    del adata_annot.obs[subject_id]
    subject_id = subject_id2
    randomEffect = subject_id

# Attach subject-level covariates that are present in the metadata. Covariates absent
# from the metadata are skipped on purpose (e.g. `QC_Gene_total_log` is only computed
# on the pseudocells later). Checking the column explicitly, instead of swallowing
# KeyError, keeps a wrong subject column from silently disabling every covariate.
for covariate in covariates:
    if covariate == 'pathology.group' or covariate not in meta.columns:
        continue
    covariate_by_subject = dict(zip(meta[meta_subject_col], meta[covariate]))
    adata_annot.obs[covariate] = adata_annot.obs[subject_id].map(covariate_by_subject)

# Drop every `.obs` column that is not needed downstream. Iterate over a snapshot of
# the column names because columns are deleted inside the loop.
keep_columns = set(covariates) | {subject_id, 'cell_type'}
for obs_column in list(adata_annot.obs.columns):
    if obs_column not in keep_columns:
        del adata_annot.obs[obs_column]

In [11]:
# Display the object again to check which `.obs` columns remain after the metadata mapping.
adata_annot

AnnData object with n_obs × n_vars = 34867 × 38199
    obs: 'pathology.group', 'cell_type', 'individualID', 'Gender', 'APOE'
    layers: 'counts'

## **Loading data into memory**

In [12]:
%%R -i adata_annot -i celltypes

# The `-i` flags push `adata_annot` and `celltypes` from Python into the R session.
# Printing the object shows how it arrived on the R side.
print(adata_annot)

print('loaded data into memory for recursive use')



class: SingleCellExperiment 
dim: 38199 34867 
metadata(0):
assays(2): X counts
rownames(38199): A1BG A1BG-AS1 ... snoZ5 snosnR66
rowData names(0):
colnames(34867): human_iNPH_1020Y1_TACCGGGGTAGGACCA-1
  human_iNPH_1020Y1_ACTATCTCAATCTGCA-1 ...
  human_iNPH_1074D_TTTGGAGTCAAATGCC-1
  human_iNPH_1074D_TCCTAATCATAGATCC-1
colData names(5): pathology.group cell_type individualID Gender APOE
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
[1] "loaded data into memory for recursive use"


#### **Pseudocell Differential Expression Analysis with custom scripts adapted from [Gazestani et al. 2023](https://www.sciencedirect.com/science/article/pii/S0092867423008590?via%3Dihub)**

#### **Pseudocell with filtering and QC**


The main arguments for running the function:

- `parsing.col.names`: a vector of column names to be used to parse the data. Usually the columns related to the donor id and the cell type annotation (see above figure).

- `pseudocell.size`: the average size of pseudocells. If NULL, generates pseudobulk data.

- `inputExpData`: Input single cell data object in SingleCellExperiment format.

- `min_size_limit`: minimum acceptable size of the pseudocells. usually 10 or 15.

- `inputPhenoData`: the metadata on the cells, matching in order the input PC space (`inputEmbedding`) that is used to create the similarity network. Providing the PC space is highly recommended (see the note below this list). default: NULL.

- `nPCs`: Number of pcs to use to create the similarity network. Used only if human or mouse

- `rand_pseudobulk_mod`: Whether to randomly assign cells to pseudocells in each cell type from each individual, or to create a similarity network between these cells and use that network to group cells with highly similar expression patterns. `default: TRUE, but here we set it to FALSE`


The authors reiterate that it is highly recommended to provide the embedding (`inputEmbedding`); otherwise the function will generate the embedding for each cell type in each individual. It is possible that such an embedding is driven by the quality of the cells in the group as opposed to biological variation, and hence the quality of the pseudocells and the inference on them would be limited.

In [13]:
# Settings for the pseudocell construction; they are pushed into R with `%%R -i`.

# Average size of pseudocells. If NULL on the R side, pseudobulk data is generated.
pseudocell_size = 30

# Minimum acceptable size of the pseudocells, usually 10 or 15.
min_size_limit = 15

# Number of PCs to use to create the similarity network.
nPCs = 30

# Passed to the pseudobulk functions as `organism`.
organism = 'Human'

In [14]:
# Drop the Python-side object. The R cells below work on the `adata_annot` object that
# the earlier `%%R -i adata_annot` cell already placed in the R session; they do not re-import it.
del adata_annot

In [15]:
%%R -i covariates -i test_names -i subject_id -i pseudocell_size -i min_size_limit -i nPCs -i organism -i pseudobulking_strategies

library(scuttle)
library(Matrix)
library(ensembldb)
library(EnsDb.Hsapiens.v86)

source("../scripts/functions/deg_functions/sconline_code.R")

# Generating the embedding space: normalise, select variable features, scale and run PCA.

exp_seurat = .extraExport2SeuratFn(adata_annot) %>%
Seurat::NormalizeData() %>%
    FindVariableFeatures() %>% 
    ScaleData() %>% 
    RunPCA(verbose=FALSE)

# Keep the first `nPCs` components so the embedding always matches the `nPCs`
# argument passed to the pseudocell function (this was hard-coded to 1:30).
# `nPCs` must not exceed the number of components computed by RunPCA.
embedding_data = exp_seurat@reductions$pca@cell.embeddings[, seq_len(nPCs)]

# Generating the pseudocells; results are collected per strategy name.

pseudobulk_strategy <- list()

if ('network' %in% pseudobulking_strategies){
  # Function adds/modifies three annotation columns: pseudocell_size, QC_Gene_total_count, QC_Gene_unique_count
  # QC_Gene_total_count: equivalent to nUMI for the pseudobulk samples
  # QC_Gene_unique_count: equivalent to nGene for the pseudobulk samples
  # use scale(tst$QC_Gene_total_count) and scale(tst$pseudocell_size) as additional covariates for the DE analysis

  pseudobulk_strategy[['network']] = suppressWarnings(.sconline.PseudobulkGeneration(argList = NULL, 
                                  # The columns in the phenoData that will be used to parse the expression data 
                                  # and generate the pseudocell/pseudobulk data
                                    parsing.col.names = c(subject_id, 'cell_type'), 
                                  # average pseudocell size.
                                    pseudocell.size = pseudocell_size,
                                    inputExpData = adata_annot,
                                  # minimum acceptable size (ie, #cells) for each pseudobulk
                                    min_size_limit = min_size_limit,
                                  # in case we want to run the function outside sconline space
                                    inputPhenoData = as.data.frame(colData(adata_annot)),
                                  # the embedding space to be used for the generation of the pseudobulk.
                                  # only needed when pseudocell.size is not null
                                    inputEmbedding = embedding_data, 
                                  # the dimension of the embedding space for the construction of pseudobulk data
                                    nPCs = nPCs, 
                                    ncores = 5,
                                  # FALSE: group cells through the similarity network instead of at random
                                    rand_pseudobulk_mod = FALSE,
                                  # used to identify and estimate Mitochondrial Genes
                                    organism = organism))
} else {
  # Assigning NULL leaves the list without a 'network' entry.
  pseudobulk_strategy[['network']] <- NULL
}

Main Functions:
.myRead10X()
.myRead10X_h5()
.myLigerToExpSet()
.mycBindFn()
.myExpSetCreatorFn()
.myIntegrative_oneline()
.myFindAllMarkers()
.myAnnotateFn()
.my2dPlot()
.myPseudoCellfn_v2()
.myLabelTransfer_harmony()
.myLabelTransfer_liger()
.myMapToHuman()
.myRiverPlotFn()
.myClusteringOptimizerFn()
.myMarkerBasedAnalysisFn()
.mycellAssignHeatmap()
.myMetaMarkerFn()
.myFindNeighbors()
.myVlnPlot()
.myFeaturePlot()
.myheatmap.3()
.myEvalMarkers()
.myReadGMT()
.mySplitObject()
.myRTNgsea(two-sided GSEA implementation)
[1] "batch information is in the anno_batch variable"
[1] "Number of MT genes in the dataset: 30 / 13"


  displaypub.publish_display_data(


Loading required package: GenomicFeatures
Loading required package: AnnotationDbi

Attaching package: ‘AnnotationDbi’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: AnnotationFilter

Attaching package: ‘AnnotationFilter’

The following object is masked from ‘package:magrittr’:

    not


Attaching package: 'ensembldb'

The following object is masked from 'package:dplyr':

    filter

The following object is masked from 'package:stats':

    filter

------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------

Attaching package: 'plyr'

The following object is masked from 'package:IRanges':

    desc

The following object is masked from 'package:S4Vectors':

    rena

#### **Pseudocells with random cell assignment**


Alternatively, we can create pseudocells using a method that randomly assigns cells to pseudocells in each cell type from each individual by setting `rand_pseudobulk_mod=TRUE`, rather than the previous method, which creates a similarity network between these cells and uses that network to group cells with highly similar expression patterns.

In [16]:
%%R 

library(scuttle)
library(Matrix)
library(ensembldb)
library(EnsDb.Hsapiens.v86)

# Build pseudocells by random cell assignment (rand_pseudobulk_mod = T); no embedding is passed to this call.
# This cell takes no `-i` inputs: adata_annot, subject_id, pseudocell_size, min_size_limit, organism,
# pseudobulking_strategies and pseudobulk_strategy must already exist in the R session from earlier cells.
# NOTE(review): no set.seed() call is visible before this random assignment — confirm whether
# .sconline.PseudobulkGeneration seeds the RNG itself, otherwise results may differ between runs.

if ('random' %in% pseudobulking_strategies){

  pseudobulk_strategy[['random']] = suppressWarnings(.sconline.PseudobulkGeneration(argList=NULL, 
                                  # The columns in the phenoData that will be used to parse the expression data 
                                  # and generate the pseudocell/pseudobulk data
                                    parsing.col.names = c(subject_id, 'cell_type'), 
                                  # average pseudocell size.
                                    pseudocell.size = pseudocell_size,
                                    inputExpData = adata_annot,
                                  # minimum acceptable size (ie, #cells) for each pseudobulk
                                    min_size_limit = min_size_limit,
                                  # in case we want to run the function outside sconline space
                                    inputPhenoData = as.data.frame(colData(adata_annot)),
                                    ncores = 5,
                                    rand_pseudobulk_mod = T,
                                  # used to identify and estimate Mitochondrial Genes
                                    organism = organism))
} else {
  # NOTE(review): the other strategy cells assign NULL in this branch; here an empty list is stored instead.
  pseudobulk_strategy[['random']] <- list()
}

[1] "batch information is in the anno_batch variable"
[1] "Number of MT genes in the dataset: 30 / 13"


#### **Pseudocells with smaller size**

We can also generate pseudocells using `.sconline.Pseudobulk10`, which allows us to have pseudocells of smaller size. The main arguments for this function:

- `inputExpData`: Input single cell data object in SingleCellExperiment format
- `embeddings`: Input PC space to use to create the similarity network. It can't be set to NULL
- `pseudobulk_split_col`: The column name to parse the single cell data.
- `min_dataset_size`: Minimum number of acceptable pseudocell size. default: 4.

In [17]:
%%R 

library(scuttle)
library(Matrix)
library(ensembldb)
library(EnsDb.Hsapiens.v86)


# No embedding is computed in this cell: `embedding_data` must already exist in the
# R session (it is created by the network-pseudocell cell above).

# Combined "<subject>_<cell type>" label, used below as the split column.
# It is added to `adata_annot` even when the strategy is not selected.
adata_annot$lib_anno = paste0(adata_annot[[subject_id]], "_", adata_annot$cell_type)

# The key checked here is 'smaller network', spelled with a space.
if ('smaller network' %in% pseudobulking_strategies){

    #creates pseudobulk samples of median size 10
    pseudobulk_strategy[['smaller network']] = .sconline.Pseudobulk10(inputExpData=adata_annot,
                                        embeddings = embedding_data,
                                        pseudobulk_split_col = "lib_anno",
                                        min_dataset_size = 4,
                                        organism = organism)
} else {
    # Assigning NULL leaves the list without a 'smaller network' entry.
    pseudobulk_strategy[['smaller network']] <- NULL
}


#### **Using Standard Pseudobulk Approach**

To benchmark the results, here we also create pseudobulk data. To create pseudobulk data per donor and cluster, we can use the `.sconline.PseudobulkGeneration` function again, but setting `pseudocell.size = NULL`.

In [18]:
%%R 

library(scuttle)
library(Matrix)
library(ensembldb)
library(EnsDb.Hsapiens.v86)

# Generating the embedding space: normalise, select variable features, scale and run PCA.
# NOTE(review): this repeats the pipeline of the network-pseudocell cell on the same
# `adata_annot`; confirm whether the earlier `embedding_data` could simply be reused.

exp_seurat = .extraExport2SeuratFn(adata_annot) %>%
Seurat::NormalizeData() %>%
    FindVariableFeatures() %>% 
    ScaleData() %>% 
    RunPCA(verbose=FALSE)

# Keep the first `nPCs` components so the embedding always matches the `nPCs`
# argument passed below (this was hard-coded to 1:30).
# `nPCs` must not exceed the number of components computed by RunPCA.
embedding_data = exp_seurat@reductions$pca@cell.embeddings[, seq_len(nPCs)]

if ('bulk' %in% pseudobulking_strategies){

  pseudobulk_strategy[['bulk']] = suppressWarnings(.sconline.PseudobulkGeneration(argList = NULL, 
                                  # The columns in the phenoData that will be used to parse the expression data 
                                  # and generate the pseudocell/pseudobulk data
                                    parsing.col.names = c(subject_id, 'cell_type'), 
                                  # NULL: generate pseudobulk samples instead of pseudocells.
                                    pseudocell.size = NULL,
                                    inputExpData = adata_annot,
                                  # minimum acceptable size (ie, #cells) for each pseudobulk
                                    min_size_limit = min_size_limit,
                                  # in case we want to run the function outside sconline space
                                    inputPhenoData = as.data.frame(colData(adata_annot)),
                                  # the embedding space to be used for the generation of the pseudobulk.
                                  # only needed when pseudocell.size is not null
                                    inputEmbedding = embedding_data, 
                                  # the dimension of the embedding space for the construction of pseudobulk data
                                    nPCs = nPCs, 
                                    ncores = 3,
                                    rand_pseudobulk_mod = FALSE,
                                    organism = organism))

} else {
  # Assigning NULL leaves the list without a 'bulk' entry.
  pseudobulk_strategy[['bulk']] <- NULL
}


[1] "Number of MT genes in the dataset: 30 / 13"


Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
Loading required package: caret
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:future':

    cluster

The following object is masked from 'package:purrr':

    lift



### **DEG Analysis**


The main function for differential expression analysis takes the following input arguments:

- `inputExpData`: Input (aggregated) single cell data object in SingleCellExperiment format

- `covariates`: The column names for covariates to be included in the analysis including the main effect

    -  This should be a list of covariates (including `pathology.group`), e.g. `Sex`, `Sample Batch`, `Age` and other factors that might not be of interest but might have an effect on the pathological status. If you would not like to include any additional confounders, please set `covariates = ['pathology.group']`.

    - `Note 1`: ensure continuous covariates are in similar scales. For example, scale %MT to be in the range of 0-100 and log2() transform nGene/nUMI values.

    - `Note 2`: make sure categorical data are standardized in format, e.g., you don't have M, Male, and male in the same dataset. All should be M, or male, or Male (it's case sensitive).

    - `Note 3`: always assess the impact of the covariates that you are including on the DE results by excluding them one by one and checking how the DE patterns change. You should be cautious if one of them is having out-sized effect on the results

- `randomEffect`: The column name for the random effect (it is donor id in most scenarios). Set to NULL for pseudobulk data.

- `DEmethod`: Method used for the differential expression analysis. Options: `Trend`, `Voom`, `VoomSampleWeights`, `Dream`. **default: `Trend`**

- `normalization`: Normalization method. Options: `CPM`, `TMM`, `VST`, `rmTop50`, and `none`. **default: `CPM`**

- `quantile.norm`: Logical, indicating whether or not to do quantile normalization on the normalized data. **default: F.** Specially useful if the highly expressed genes like `MALAT1` are being identified as differentially expressed.

- `bkg_genes`: Background genes used for differential expression testing, usually set to genes expressed in >1% of cells. Can be provided to substantially speed up re-runs (e.g. when exploring the effect of different covariates on DE genes). default: NULL

    - `Note 4`: check the number of background genes that are being included in the analysis. If it's too conservative, you would lose many of DE genes (as they are not being tested) and if it's too liberal, noise would be added to the results

We want to do the DE analysis on each cell-type individually. So first we need to split the pseudocell and pseudobulk samples based on the cluster annotations

In [19]:
%%R -o pseudobulk_strategy_list

# For every selected strategy: add log2-transformed total-count and unique-gene-count
# covariates, then split the object by cell type. The nested result is exported to Python.
pseudobulk_strategy_list <- list()

for (strategy in pseudobulking_strategies) {
    strategy_data <- pseudobulk_strategy[[strategy]]

    # log2 transform of QC_Gene_total_count and QC_Gene_unique_count
    strategy_data$QC_Gene_total_log <- log2(strategy_data$QC_Gene_total_count)
    strategy_data$QC_Gene_unique_log <- log2(strategy_data$QC_Gene_unique_count)

    # write the annotated object back so the stored strategy carries the new columns
    pseudobulk_strategy[[strategy]] <- strategy_data

    # split the object based on the cluster annotations
    pseudobulk_strategy_list[[strategy]] <- .mySplitObject(strategy_data, 'cell_type')
}




In [20]:
# Keep a copy of each per-cell-type expression matrix under layers['counts'].
for strategy in pseudobulking_strategies:
    for cell_type in celltypes:
        try:
            celltype_data = pseudobulk_strategy_list[strategy][cell_type]
            celltype_data.layers['counts'] = celltype_data.X.copy()
        except KeyError:
            # no entry for this strategy / cell type combination: nothing to copy
            continue

Next, we perform DE analysis using the specified methods in limma.

In [21]:
# Rebuild the nested {strategy: {cell_type: object}} mapping as nested R lists:
# one rpy2 ListVector per pseudobulking strategy, wrapped in an outer ListVector.
# NOTE(review): the original comment describes the leaves as Seurat objects — confirm.

r_lists_by_strategy = {
    strategy: robjects.ListVector(
        {
            cell_type: pseudobulk_strategy_list[strategy][cell_type]
            for cell_type in pseudobulk_strategy_list[strategy].keys()
        }
    )
    for strategy in pseudobulking_strategies
}

# This rebinds `pseudobulk_strategy_list`, the same name the mapping above was read from.
pseudobulk_strategy_list = robjects.ListVector(r_lists_by_strategy)

#### **Estimating DEGS in Pseudocells with Network Aggregation**

In [22]:
%%R -i pseudobulk_strategy_list -i covariates -i filter_genes -i gene_celltype_threshold -i deg_methods_to_run -i normalization_methods -i randomEffect

# Differential expression on the 'network' pseudocells.
# For every DE method x normalization method x cell type, fit the model with
# .sconline.fitLimmaFn, evaluate four pathology.group contrasts
# (early-no, late-no, late-early, mean(early,late)-no) and merge them into one
# gene-level table. Results are stored as
#   DE_results[['network']][[de_method]][[norm_method]][[cell_type]]
# (NULL for a cell type that was skipped).
# `pseudobulking_strategies` and `adata_annot` are read from the R session;
# they are not passed in through -i on this cell.

source("../scripts/functions/deg_functions/sconline_code.R")

########### Setting the parameters ################

# The column names for covariates to be included in the analysis including the main effect
covList = covariates
# background genes used for differential expression testing.
# usually set to genes expressed in >1% of cells. default: NULL
if (filter_genes){
    # minimum fraction of a cell type's cells in which a gene must be detected
    bkg_gene_pct_thr = gene_celltype_threshold
    # minimum number of cells of the cell type in which a gene must be detected
    bkg_gene_count_thr = 10
} else{
    bkg_gene_pct_thr = NULL
}
# The column name for the random effect (it is donor id in most scenarios).
# Set to NULL for pseudobulk data
rand_var = randomEffect
# whether or not to do quantile normalization on the normalized data. default: F.
# Especially useful if the highly expressed genes like MALAT1 are being identified as differentially expressed.
quantile_norm = T

# This cell (re)creates the result container that the following DE cells add to
DE_results <- list()

if ('network' %in% pseudobulking_strategies){

  DE_results[['network']] <- list()

  for (de_method in deg_methods_to_run){

    DE_results[['network']][[de_method]] <- list()

    for (norm_method in normalization_methods){

      print(paste0('Estimating DEGs using ', de_method))
      # DE results on pseudobulk_strategy_list[['network']], one entry per cell type
      results = lapply(names(pseudobulk_strategy_list[['network']]),function(x_name){

          x = pseudobulk_strategy_list[['network']][[x_name]]
          # stays NULL when the cell type is skipped or the model has no pathology.group term
          res_arranged = NULL

          # requiring more than one pathology group and at least 6 pseudocells for the DE analysis
          if(length(unique(x$pathology.group))>1 && ncol(x)>5){
            print(paste0("Calculating DE genes for ", x_name))

            if(!is.null(bkg_gene_pct_thr)){
              # background genes should be selected based on the cell level expression data
              tmp_bkg_genes = counts(adata_annot)[,which(adata_annot$cell_type==x_name)]
              # per gene: number of cells of this cell type with a non-zero count
              tmp_bkg_genes_counts=rowSums(tmp_bkg_genes>0)
              # na.rm keeps the denominator consistent with which() above, which drops NA cell types
              tmp_bkg_genes_frac=tmp_bkg_genes_counts/sum(adata_annot$cell_type==x_name, na.rm=TRUE)
              tmp_bkg_genes=row.names(adata_annot)[tmp_bkg_genes_frac>=bkg_gene_pct_thr&tmp_bkg_genes_counts>=bkg_gene_count_thr]

            } else {
              tmp_bkg_genes=NULL
            }

            res =.sconline.fitLimmaFn(inputExpData=x,
                                    covariates=covList,
                                    randomEffect = rand_var,
                                    bkg_genes = tmp_bkg_genes,
                                    quantile.norm = quantile_norm,
                                    prior.count = 1,
                                    DEmethod = de_method,
                                    normalization = norm_method)
            #check the dc object, the usual consensus.correlation that I get is in the range of ~0.2 or above if rand=T


            if(sum(grepl("pathology.group", colnames(res$model)))>0){
              # Defining the comparisons that we are interested in to know their logFC and pval
              contr <- makeContrasts(
                  pathology.groupearly - pathology.groupno,
                  pathology.grouplate - pathology.groupno,
                  pathology.grouplate - pathology.groupearly,
                  (pathology.groupearly + pathology.grouplate)/2 - pathology.groupno,
                  levels = res$model
                  )

              fit2=contrasts.fit(res$fit, contrasts=contr)
              #Explore setting robust=F
              fit2=eBayes(fit2,robust = T,trend=T)
              #running topTable for each contrast (all genes, BH-adjusted)
              res_early=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.groupearly - pathology.groupno");
              res_late=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupno");
              res_lve=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupearly");
              res_ad=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="(pathology.groupearly + pathology.grouplate)/2 - pathology.groupno");

              # Each table is ranked for its own contrast, so the gene label must come
              # from that table's own row names (using res_early's row names for
              # res_late would attach late statistics to the wrong genes).
              res_early$gene=row.names(res_early)
              res_late$gene=row.names(res_late)
              res_ad$gene=row.names(res_ad)
              res_lve$gene=row.names(res_lve)

              # prefix every column with its contrast so the merged table stays unambiguous
              colnames(res_early)=paste0("early.",colnames(res_early))
              colnames(res_late)=paste0("late.",colnames(res_late))
              colnames(res_ad)=paste0("ad.",colnames(res_ad))
              colnames(res_lve)=paste0("lve.",colnames(res_lve))

              # Merge the four contrast tables on gene
              merged_early_late <- merge(res_early, res_late, by.x="early.gene", by.y="late.gene", all=TRUE)
              merged_lve <- merge(merged_early_late, res_lve, by.x="early.gene", by.y="lve.gene", all=TRUE)
              merged_all <- merge(merged_lve, res_ad, by.x="early.gene", by.y="ad.gene", all=TRUE)

              res_arranged <- merged_all

              res_arranged$blocked_analysis=res$blocked_analysis
              res_arranged$block.cor=res$dc$consensus.correlation #check the dc object, the usual consensus.correlation range is ~0.2 or above if rand=T
              res_arranged$cell_type=unique(as.character(x$cell_type))
              colnames(res_arranged)[colnames(res_arranged)=="early.gene"]="gene"
            }
          }
        return(res_arranged)
      })

      # Naming the list entries by cell type
      named_results <- setNames(results, names(pseudobulk_strategy_list[['network']]))

      # Storing the named results in DE_results[['network']]
      DE_results[['network']][[de_method]][[norm_method]] <- named_results

    }

    print('............................')
    print('............................')

  }

} else {
  DE_results[['network']] <- NULL
}


Main Functions:
.myRead10X()
.myRead10X_h5()
.myLigerToExpSet()
.mycBindFn()
.myExpSetCreatorFn()
.myIntegrative_oneline()
.myFindAllMarkers()
.myAnnotateFn()
.my2dPlot()
.myPseudoCellfn_v2()
.myLabelTransfer_harmony()
.myLabelTransfer_liger()
.myMapToHuman()
.myRiverPlotFn()
.myClusteringOptimizerFn()
.myMarkerBasedAnalysisFn()
.mycellAssignHeatmap()
.myMetaMarkerFn()
.myFindNeighbors()
.myVlnPlot()
.myFeaturePlot()
.myheatmap.3()
.myEvalMarkers()
.myReadGMT()
.mySplitObject()
.myRTNgsea(two-sided GSEA implementation)
[1] "Estimating DEGs using Trend"
[1] "Calculating DE genes for OPC"
[1] "Number of expressed genes: 17558"
[1] "............................"
[1] "............................"


#### **Estimating DEGS in Pseudocells with Random Aggregation**

In [23]:
%%R 

# Differential expression on the 'random' pseudocells.
# For every DE method x normalization method x cell type, fit the model with
# .sconline.fitLimmaFn, evaluate four pathology.group contrasts
# (early-no, late-no, late-early, mean(early,late)-no) and merge them into one
# gene-level table. Results are stored as
#   DE_results[['random']][[de_method]][[norm_method]][[cell_type]]
# (NULL for a cell type that was skipped).
# Everything this cell reads (DE_results, covariates, filter_genes,
# pseudobulk_strategy_list, adata_annot, ...) must already exist in the R session;
# nothing is passed in through -i here.

########### Setting the parameters ################

# The column names for covariates to be included in the analysis including the main effect
covList = covariates
# background genes used for differential expression testing.
# usually set to genes expressed in >1% of cells. default: NULL
if (filter_genes){
    # minimum fraction of a cell type's cells in which a gene must be detected
    bkg_gene_pct_thr = gene_celltype_threshold
    # minimum number of cells of the cell type in which a gene must be detected
    bkg_gene_count_thr = 10
} else{
    bkg_gene_pct_thr = NULL
}
# The column name for the random effect (it is donor id in most scenarios).
# Set to NULL for pseudobulk data
rand_var = randomEffect
# whether or not to do quantile normalization on the normalized data. default: F.
# Especially useful if the highly expressed genes like MALAT1 are being identified as differentially expressed.
quantile_norm = T

if ('random' %in% pseudobulking_strategies){

  for (de_method in deg_methods_to_run){

    DE_results[['random']][[de_method]] <- list()

    for (norm_method in normalization_methods){

      print(paste0('Estimating DEGs using ', de_method))
      # DE results on pseudobulk_strategy_list[['random']], one entry per cell type
      results = lapply(names(pseudobulk_strategy_list[['random']]),function(x_name){

          x = pseudobulk_strategy_list[['random']][[x_name]]
          # stays NULL when the cell type is skipped or the model has no pathology.group term
          res_arranged = NULL

          # requiring more than one pathology group and at least 6 pseudocells for the DE analysis
          if(length(unique(x$pathology.group))>1 && ncol(x)>5){
            print(paste0("Calculating DE genes for ", x_name))

            if(!is.null(bkg_gene_pct_thr)){
              # background genes should be selected based on the cell level expression data
              tmp_bkg_genes = counts(adata_annot)[,which(adata_annot$cell_type==x_name)]
              # per gene: number of cells of this cell type with a non-zero count
              tmp_bkg_genes_counts=rowSums(tmp_bkg_genes>0)
              # na.rm keeps the denominator consistent with which() above, which drops NA cell types
              tmp_bkg_genes_frac=tmp_bkg_genes_counts/sum(adata_annot$cell_type==x_name, na.rm=TRUE)
              tmp_bkg_genes=row.names(adata_annot)[tmp_bkg_genes_frac>=bkg_gene_pct_thr&tmp_bkg_genes_counts>=bkg_gene_count_thr]

            } else {
              tmp_bkg_genes=NULL
            }

            res =.sconline.fitLimmaFn(inputExpData=x,
                                    covariates=covList,
                                    randomEffect = rand_var,
                                    bkg_genes = tmp_bkg_genes,
                                    quantile.norm = quantile_norm,
                                    prior.count = 1,
                                    DEmethod = de_method,
                                    normalization = norm_method)
            #check the dc object, the usual consensus.correlation that I get is in the range of ~0.2 or above if rand=T


            if(sum(grepl("pathology.group", colnames(res$model)))>0){
              # Defining the comparisons that we are interested in to know their logFC and pval
              contr <- makeContrasts(
                  pathology.groupearly - pathology.groupno,
                  pathology.grouplate - pathology.groupno,
                  pathology.grouplate - pathology.groupearly,
                  (pathology.groupearly + pathology.grouplate)/2 - pathology.groupno,
                  levels = res$model
                  )

              fit2=contrasts.fit(res$fit, contrasts=contr)
              #Explore setting robust=F
              fit2=eBayes(fit2,robust = T,trend=T)
              #running topTable for each contrast (all genes, BH-adjusted)
              res_early=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.groupearly - pathology.groupno");
              res_late=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupno");
              res_lve=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupearly");
              res_ad=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="(pathology.groupearly + pathology.grouplate)/2 - pathology.groupno");

              # Each table is ranked for its own contrast, so the gene label must come
              # from that table's own row names (using res_early's row names for
              # res_late would attach late statistics to the wrong genes).
              res_early$gene=row.names(res_early)
              res_late$gene=row.names(res_late)
              res_ad$gene=row.names(res_ad)
              res_lve$gene=row.names(res_lve)

              # prefix every column with its contrast so the merged table stays unambiguous
              colnames(res_early)=paste0("early.",colnames(res_early))
              colnames(res_late)=paste0("late.",colnames(res_late))
              colnames(res_ad)=paste0("ad.",colnames(res_ad))
              colnames(res_lve)=paste0("lve.",colnames(res_lve))

              # Merge the four contrast tables on gene
              merged_early_late <- merge(res_early, res_late, by.x="early.gene", by.y="late.gene", all=TRUE)
              merged_lve <- merge(merged_early_late, res_lve, by.x="early.gene", by.y="lve.gene", all=TRUE)
              merged_all <- merge(merged_lve, res_ad, by.x="early.gene", by.y="ad.gene", all=TRUE)

              res_arranged <- merged_all

              res_arranged$blocked_analysis=res$blocked_analysis
              res_arranged$block.cor=res$dc$consensus.correlation #check the dc object, the usual consensus.correlation range is ~0.2 or above if rand=T
              res_arranged$cell_type=unique(as.character(x$cell_type))
              colnames(res_arranged)[colnames(res_arranged)=="early.gene"]="gene"
            }

          }
        return(res_arranged)
      })

      # Naming the list entries by cell type
      named_results <- setNames(results, names(pseudobulk_strategy_list[['random']]))

      # Storing the named results in DE_results[['random']]
      DE_results[['random']][[de_method]][[norm_method]] <- named_results
    }

    print('............................')
    print('............................')

  }

} else{

  DE_results[['random']] <- NULL

}

[1] "Estimating DEGs using Trend"
[1] "Calculating DE genes for OPC"
[1] "Number of expressed genes: 17558"
[1] "............................"
[1] "............................"


#### **Estimating DEGS in Pseudocells with Smaller Size**

In [24]:
%%R

# Differential expression on the 'smaller network' pseudocells.
# For every DE method x normalization method x cell type, fit the model with
# .sconline.fitLimmaFn, evaluate four pathology.group contrasts
# (early-no, late-no, late-early, mean(early,late)-no) and merge them into one
# gene-level table. Results are stored as
#   DE_results[['smaller network']][[de_method]][[norm_method]][[cell_type]]
# (NULL for a cell type that was skipped).
# Everything this cell reads (DE_results, covariates, filter_genes,
# pseudobulk_strategy_list, adata_annot, ...) must already exist in the R session;
# nothing is passed in through -i here.

########### Setting the parameters ################

# The column names for covariates to be included in the analysis including the main effect
covList = covariates
# background genes used for differential expression testing.
# usually set to genes expressed in >1% of cells. default: NULL
if (filter_genes){
    # minimum fraction of a cell type's cells in which a gene must be detected
    bkg_gene_pct_thr = gene_celltype_threshold
    # minimum number of cells of the cell type in which a gene must be detected
    bkg_gene_count_thr = 10
} else{
    bkg_gene_pct_thr = NULL
}
# The column name for the random effect (it is donor id in most scenarios).
# Set to NULL for pseudobulk data
rand_var = randomEffect
# whether or not to do quantile normalization on the normalized data. default: F.
# Especially useful if the highly expressed genes like MALAT1 are being identified as differentially expressed.
quantile_norm = T


if ('smaller network' %in% pseudobulking_strategies){

  for (de_method in deg_methods_to_run){

    DE_results[['smaller network']][[de_method]] <- list()

    for (norm_method in normalization_methods){

      print(paste0('Estimating DEGs using ', de_method))
      # DE results on pseudobulk_strategy_list[['smaller network']], one entry per cell type
      results = lapply(names(pseudobulk_strategy_list[['smaller network']]),function(x_name){

          x = pseudobulk_strategy_list[['smaller network']][[x_name]]
          # stays NULL when the cell type is skipped or the model has no pathology.group term
          res_arranged = NULL

          # requiring more than one pathology group and at least 6 pseudocells for the DE analysis
          if(length(unique(x$pathology.group))>1 && ncol(x)>5){
            print(paste0("Calculating DE genes for ", x_name))

            if(!is.null(bkg_gene_pct_thr)){
              # background genes should be selected based on the cell level expression data
              tmp_bkg_genes = counts(adata_annot)[,which(adata_annot$cell_type==x_name)]
              # per gene: number of cells of this cell type with a non-zero count
              tmp_bkg_genes_counts=rowSums(tmp_bkg_genes>0)
              # na.rm keeps the denominator consistent with which() above, which drops NA cell types
              tmp_bkg_genes_frac=tmp_bkg_genes_counts/sum(adata_annot$cell_type==x_name, na.rm=TRUE)
              tmp_bkg_genes=row.names(adata_annot)[tmp_bkg_genes_frac>=bkg_gene_pct_thr&tmp_bkg_genes_counts>=bkg_gene_count_thr]

            } else {
              tmp_bkg_genes=NULL
            }

            res =.sconline.fitLimmaFn(inputExpData=x,
                                    covariates=covList,
                                    randomEffect = rand_var,
                                    bkg_genes = tmp_bkg_genes,
                                    quantile.norm = quantile_norm,
                                    prior.count = 1,
                                    DEmethod = de_method,
                                    normalization = norm_method)
            #check the dc object, the usual consensus.correlation that I get is in the range of ~0.2 or above if rand=T


            if(sum(grepl("pathology.group", colnames(res$model)))>0){
              # Defining the comparisons that we are interested in to know their logFC and pval
              contr <- makeContrasts(
                  pathology.groupearly - pathology.groupno,
                  pathology.grouplate - pathology.groupno,
                  pathology.grouplate - pathology.groupearly,
                  (pathology.groupearly + pathology.grouplate)/2 - pathology.groupno,
                  levels = res$model
                  )

              fit2=contrasts.fit(res$fit, contrasts=contr)
              #Explore setting robust=F
              fit2=eBayes(fit2,robust = T,trend=T)
              #running topTable for each contrast (all genes, BH-adjusted)
              res_early=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.groupearly - pathology.groupno");
              res_late=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupno");
              res_lve=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupearly");
              res_ad=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="(pathology.groupearly + pathology.grouplate)/2 - pathology.groupno");

              # Each table is ranked for its own contrast, so the gene label must come
              # from that table's own row names (using res_early's row names for
              # res_late would attach late statistics to the wrong genes).
              res_early$gene=row.names(res_early)
              res_late$gene=row.names(res_late)
              res_ad$gene=row.names(res_ad)
              res_lve$gene=row.names(res_lve)

              # prefix every column with its contrast so the merged table stays unambiguous
              colnames(res_early)=paste0("early.",colnames(res_early))
              colnames(res_late)=paste0("late.",colnames(res_late))
              colnames(res_ad)=paste0("ad.",colnames(res_ad))
              colnames(res_lve)=paste0("lve.",colnames(res_lve))

              # Merge the four contrast tables on gene
              merged_early_late <- merge(res_early, res_late, by.x="early.gene", by.y="late.gene", all=TRUE)
              merged_lve <- merge(merged_early_late, res_lve, by.x="early.gene", by.y="lve.gene", all=TRUE)
              merged_all <- merge(merged_lve, res_ad, by.x="early.gene", by.y="ad.gene", all=TRUE)

              res_arranged <- merged_all

              res_arranged$blocked_analysis=res$blocked_analysis
              res_arranged$block.cor=res$dc$consensus.correlation #check the dc object, the usual consensus.correlation range is ~0.2 or above if rand=T
              res_arranged$cell_type=unique(as.character(x$cell_type))
              colnames(res_arranged)[colnames(res_arranged)=="early.gene"]="gene"
            }
          }
        return(res_arranged)
      })

      # Naming the list entries by cell type
      named_results <- setNames(results, names(pseudobulk_strategy_list[['smaller network']]))

      # Storing the named results in DE_results[['smaller network']]
      DE_results[['smaller network']][[de_method]][[norm_method]] <- named_results
    }

    print('............................')
    print('............................')
  }

} else {

  DE_results[['smaller network']] <- NULL
}

#### **Estimating DEGS in Pseudocells with Standard Pseudobulking**

In [25]:
%%R 

# Differential expression on the standard pseudobulk ('bulk') samples.
# For every DE method x normalization method x cell type, fit the model with
# .sconline.fitLimmaFn (no random effect), evaluate four pathology.group
# contrasts (early-no, late-no, late-early, mean(early,late)-no) and merge them
# into one gene-level table. Results are stored as
#   DE_results[['bulk']][[de_method]][[norm_method]][[cell_type]]
# (NULL for a cell type that was skipped).
# Everything this cell reads (DE_results, covariates, filter_genes,
# pseudobulk_strategy_list, adata_annot, ...) must already exist in the R session;
# nothing is passed in through -i here.

########### Setting the parameters ################

# The column names for covariates to be included in the analysis including the main effect
covList = covariates
# background genes used for differential expression testing.
# usually set to genes expressed in >1% of cells. default: NULL
if (filter_genes){
    # minimum fraction of a cell type's cells in which a gene must be detected
    bkg_gene_pct_thr = gene_celltype_threshold
    # minimum number of cells of the cell type in which a gene must be detected
    bkg_gene_count_thr = 10
} else{
    bkg_gene_pct_thr = NULL
}
# The column name for the random effect (it is donor id in most scenarios).
# Set to NULL for pseudobulk data.
# Assigned here like in the other DE cells, but the fit below passes randomEffect = NULL.
rand_var = randomEffect
# whether or not to do quantile normalization on the normalized data. default: F.
# Especially useful if the highly expressed genes like MALAT1 are being identified as differentially expressed.
quantile_norm = T



if ('bulk' %in% pseudobulking_strategies){

  for (de_method in deg_methods_to_run){

    DE_results[['bulk']][[de_method]] <- list()

    for (norm_method in normalization_methods){

      print(paste0('Estimating DEGs using ', de_method))
      # DE results on pseudobulk_strategy_list[['bulk']], one entry per cell type
      results = lapply(names(pseudobulk_strategy_list[['bulk']]),function(x_name){

          x = pseudobulk_strategy_list[['bulk']][[x_name]]
          # stays NULL when the cell type is skipped or the model has no pathology.group term
          res_arranged = NULL

          # requiring more than one pathology group and at least 6 pseudobulk samples for the DE analysis
          if(length(unique(x$pathology.group))>1 && ncol(x)>5){
            print(paste0("Calculating DE genes for ", x_name))

            if(!is.null(bkg_gene_pct_thr)){
              # background genes should be selected based on the cell level expression data
              tmp_bkg_genes = counts(adata_annot)[,which(adata_annot$cell_type==x_name)]
              # per gene: number of cells of this cell type with a non-zero count
              tmp_bkg_genes_counts=rowSums(tmp_bkg_genes>0)
              # na.rm keeps the denominator consistent with which() above, which drops NA cell types
              tmp_bkg_genes_frac=tmp_bkg_genes_counts/sum(adata_annot$cell_type==x_name, na.rm=TRUE)
              tmp_bkg_genes=row.names(adata_annot)[tmp_bkg_genes_frac>=bkg_gene_pct_thr&tmp_bkg_genes_counts>=bkg_gene_count_thr]

            } else {
              tmp_bkg_genes=NULL
            }

            # no random effect is passed for the pseudobulk fit
            res =.sconline.fitLimmaFn(inputExpData=x,
                                    covariates=covList,
                                    randomEffect = NULL,
                                    bkg_genes = tmp_bkg_genes,
                                    quantile.norm = quantile_norm,
                                    prior.count = 1,
                                    DEmethod = de_method,
                                    normalization = norm_method)

            if(sum(grepl("pathology.group", colnames(res$model)))>0){
              # Defining the comparisons that we are interested in to know their logFC and pval
              contr <- makeContrasts(
                  pathology.groupearly - pathology.groupno,
                  pathology.grouplate - pathology.groupno,
                  pathology.grouplate - pathology.groupearly,
                  (pathology.groupearly + pathology.grouplate)/2 - pathology.groupno,
                  levels = res$model
                  )

              fit2=contrasts.fit(res$fit, contrasts=contr)
              #Explore setting robust=F
              fit2=eBayes(fit2,robust = T,trend=T)
              #running topTable for each contrast (all genes, BH-adjusted)
              res_early=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.groupearly - pathology.groupno");
              res_late=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupno");
              res_lve=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="pathology.grouplate - pathology.groupearly");
              res_ad=topTable(fit2,number=dim(fit2)[1], adjust.method = "BH", coef="(pathology.groupearly + pathology.grouplate)/2 - pathology.groupno");

              # Each table is ranked for its own contrast, so the gene label must come
              # from that table's own row names (using res_early's row names for
              # res_late would attach late statistics to the wrong genes).
              res_early$gene=row.names(res_early)
              res_late$gene=row.names(res_late)
              res_ad$gene=row.names(res_ad)
              res_lve$gene=row.names(res_lve)

              # prefix every column with its contrast so the merged table stays unambiguous
              colnames(res_early)=paste0("early.",colnames(res_early))
              colnames(res_late)=paste0("late.",colnames(res_late))
              colnames(res_ad)=paste0("ad.",colnames(res_ad))
              colnames(res_lve)=paste0("lve.",colnames(res_lve))

              # Merge the four contrast tables on gene
              merged_early_late <- merge(res_early, res_late, by.x="early.gene", by.y="late.gene", all=TRUE)
              merged_lve <- merge(merged_early_late, res_lve, by.x="early.gene", by.y="lve.gene", all=TRUE)
              merged_all <- merge(merged_lve, res_ad, by.x="early.gene", by.y="ad.gene", all=TRUE)

              res_arranged <- merged_all

              res_arranged$blocked_analysis=res$blocked_analysis
              res_arranged$cell_type=unique(as.character(x$cell_type))
              colnames(res_arranged)[colnames(res_arranged)=="early.gene"]="gene"


            }
          }
        return(res_arranged)
      })

      # Naming the list entries by cell type
      named_results <- setNames(results, names(pseudobulk_strategy_list[['bulk']]))

      # Storing the named results in DE_results[['bulk']]
      DE_results[['bulk']][[de_method]][[norm_method]] <- named_results
    }

    print('............................')
    print('............................')

  }

} else {

  DE_results[['bulk']] <- NULL
}

[1] "Estimating DEGs using Trend"
[1] "Calculating DE genes for OPC"


[1] "Number of expressed genes: 17558"
[1] "............................"
[1] "............................"


### **Robustness analysis**

Robustness analysis provides an orthogonal support for the identified DE genes. In addition to being an independent approach from the DE analysis method, `robustness scores are also not affected by the covariates,` making it a simple, `intuitive way to assess our confidence on the DE genes`. 

Briefly, the robustness score is `defined as the fraction of samples in the experiment that show an up or down regulation pattern across control condition`. For example, if we have 4 Abeta donors and 5 Ctrl donors, the robustness score for each gene is calculated by comparing cells from each of the Abeta donors with cells from each of the Ctrl donors (e.g., 4 x 5 = 20 comparisons). Robustness scores are scaled to be between -1 and 1. A score of 1 (-1) indicates that a gene is consistently up (down) regulated in every experiment donor compared to every control. Similarly, a robustness score of 0 indicates that a gene is randomly up/down regulated across experiment donors.

We perform robustness analysis using the function provided in the code by [Gazestani et al. 2023](https://www.sciencedirect.com/science/article/pii/S0092867423008590?via%3Dihub). The default robustness score threshold of 0.6 means that a gene is consistently up or down regulated in at least 80% of comparisons. Note that for thresholding on the robustness scores, absolute values should be used. Moreover, since robustness scores are not affected by the DE method, we don't need to repeat their calculation every time we change the pseudocell/pseudobulk strategy or alter the list of covariates. They need to be calculated only once per dataset.

In [26]:
# %%R

# #A sample function to calculate robustness scores for earl, late, ad test conditions compared to controls
# .robustness_scoresfn = function(input_exp_data){

#   # Filter out columns with NA cell_type values
#   res_robustness=list()
#     input_exp_data=input_exp_data[,!is.na(input_exp_data$cell_type)]

#   # Loop through each unique cell type to compute robustness scores
#   for(iclust in unique(input_exp_data$cell_type)){
#     print(paste0("Calculating robustness for ",iclust))

#     # Extract data for the current cell type
#     tmp_data=input_exp_data[,which(input_exp_data$cell_type==iclust)]
#     # Convert the extracted data to Seurat format and normalize it
#     tmp_data_seurat =.extraExport2SeuratFn(inputData=tmp_data, project="scRNA")
#     tmp_data_seurat = Seurat::NormalizeData(tmp_data_seurat, verbose=F)

#     # If the data contains both "early" and "no" status values, calculate robustness for early
#     if(sum(unique(tmp_data$pathology.group) %in% c("early","no"))==2){
#       robustness_early = .sconline.RobustFC(inputData=tmp_data, 
#                                             batch_col=subject_id, 
#                                             contrast_col="pathology.group", 
#                                             contrast_1="early", 
#                                             contrast_2="no", 
#                                             sex_col=NULL,
#                                             ncores=5,
#                                             groupLevel=F)

#       robustness_early=do.call("rbind",robustness_early)
#       robustness_cell.count.1=aggregate(cell.count.1~gene,data=robustness_early,mean)
#       robustness_FCscore=aggregate(score_logFC~gene,data=robustness_early,mean)
#       robustness_PCTscore=aggregate(score_pct~gene,data=robustness_early,mean)
#       robustness_meanRefCount=aggregate(ref_count~gene,data=robustness_early,mean)
#       robustness_early=merge(robustness_FCscore,robustness_PCTscore,by="gene")
#       robustness_early=merge(robustness_early,robustness_meanRefCount,by="gene")
#       robustness_early=merge(robustness_early,robustness_cell.count.1,by="gene")
#       robustness_early=robustness_early[order(robustness_early$score_pct,decreasing = T),]
#       robustness_early$pathology.group="early"
      
#       tmp_res_seurat=.myEvalMarkers(object=tmp_data_seurat, 
#                                     cells.1=colnames(tmp_data_seurat)[which(tmp_data_seurat@meta.data$pathology.group=="early")],
#                                     cells.2=colnames(tmp_data_seurat)[which(tmp_data_seurat@meta.data$pathology.group=="no")], 
#                                     slot = "data", 
#                                     features = NULL, 
#                                     thresh.min = 0, 
#                                     pseudocount.use = 1,
#                                     cells.1.weight.col=NULL,
#                                     cluster_name=iclust)

#       colnames(tmp_res_seurat)=paste0("seurat_",colnames(tmp_res_seurat))
#       tmp_res_seurat$gene=row.names(tmp_res_seurat)
#       robustness_early=merge(robustness_early,tmp_res_seurat,by="gene",all.x=T)
#       res_robustness=c(res_robustness,list(robustness_early))
      
#     }
    
#     if(sum(unique(tmp_data$pathology.group) %in% c("late","no"))==2){
#       robustness_late=.sconline.RobustFC(inputData=tmp_data,
#                                           batch_col=subject_id,
#                                           contrast_col="pathology.group",
#                                           contrast_1="late",
#                                           contrast_2="no",
#                                           sex_col=NULL,
#                                           ncores=5,
#                                           groupLevel=F)

#       robustness_late=do.call("rbind",robustness_late)
#       robustness_cell.count.1=aggregate(cell.count.1~gene,data=robustness_late,mean)
#       robustness_FCscore=aggregate(score_logFC~gene,data=robustness_late,mean)
#       robustness_PCTscore=aggregate(score_pct~gene,data=robustness_late,mean)
#       robustness_meanRefCount=aggregate(ref_count~gene,data=robustness_late,mean)
#       robustness_late=merge(robustness_FCscore,robustness_PCTscore,by="gene")
#       robustness_late=merge(robustness_late,robustness_meanRefCount,by="gene")
#       robustness_late=merge(robustness_late,robustness_cell.count.1,by="gene")
#       robustness_late=robustness_late[order(robustness_late$score_pct,decreasing = T),]
      
#       robustness_late$pathology.group="late"
      
#       tmp_res_seurat=.myEvalMarkers(object=tmp_data_seurat, 
#                                     cells.1=colnames(tmp_data_seurat)[which(tmp_data_seurat@meta.data$status=="late")],
#                                     cells.2=colnames(tmp_data_seurat)[which(tmp_data_seurat@meta.data$status=="no")], 
#                                     slot = "data", 
#                                     features = NULL,
#                                     thresh.min = 0, 
#                                     pseudocount.use = 1,
#                                     cells.1.weight.col=NULL,
#                                     cluster_name=iclust)
#       colnames(tmp_res_seurat)=paste0("seurat_",colnames(tmp_res_seurat))
#       tmp_res_seurat$gene=row.names(tmp_res_seurat)
#       robustness_late=merge(robustness_late,tmp_res_seurat,by="gene",all.x=T)
#       res_robustness=c(res_robustness,list(robustness_late))
#     }
    
#   }

#   # Rename the columns of the results based on the status
#   res_robustness <- lapply(res_robustness,function(x){
#     if(unique(x$pathology.group) == "early"){
#       colnames(x) <- paste0("early.",colnames(x))
#     }
#     else if(unique(x$pathology.group) == "late"){
#       colnames(x) <- paste0("late.",colnames(x))
#     }
    
#     return(x)
#   })
  
#   res_robustness.early <- res_robustness[c(TRUE,FALSE)]
#   res_robustness.late <- res_robustness[c(FALSE,TRUE)]
  
  
#   names(res_robustness.early) <- lapply(res_robustness.early, function(x){
#     unique(x$early.cell_type)
#   })
  
#   names(res_robustness.late) <- lapply(res_robustness.late, function(x){
#     unique(x$late.cell_type)
#   })
  
#   res.rob <- lapply(names(res_robustness.late),function(x){
#     print(x)
#     out <- merge(res_robustness.early[[x]], 
#                 res_robustness.late[[x]], 
#                 by.x="early.gene", 
#                 by.y="late.gene",
#                 all=TRUE)

#     return(out)
#   })
  
#   print("Robustness merged by cell type")
  
#   names(res.rob) <- lapply(res.rob, function(x){
#     unique(x$KO.cell_type)
#   })
#   return(res.rob)
# }

# rob.res=.robustness_scoresfn(input_exp_data=adata_annot)

### **Jack-Knifing**

Jack-knifing is a resampling technique in which we repeat the analysis on subsets of the samples to assess the robustness of the observed patterns. If the results are driven by outlier samples, we expect to see a bimodal or multi-modal pattern when we jack-knife the analysis. `This technique is most useful when we are dealing with large datasets with 10+ controls and 10+ experimental conditions.`

We can perform jack-knife analysis at two levels:

- `Leave-one-out-validation (LOOV)`: In this type of analysis, we iteratively exclude one subject/donor from the analysis and examine the p-value distribution of the genes.

- `50% resampling:` In this analysis, iterating 100 times, we randomly select 50% of the donors from the cohort and repeat the analysis to see whether the logFC pattern of each gene remains the same or changes. The number of iterations in which the logFC sign remains the same as in the full cohort provides a measure of the robustness of the pattern.

Below we perform jack-knifing to assess the robustness of the DE patterns.

In [27]:
%%R

# Fit the limma DE model on one jack-knife subset of a single cell type and collect, per gene,
# the statistics of the four pathology contrasts (early-no, late-no, late-early, AD-no).
#
# Arguments:
#   inputExpData  expression object of one cell type (columns = pseudocells) exposing
#                 `pathology.group` and `cell_type` through `$`
#   dc            list holding the full-cohort `consensus.correlation`.
#                 NOTE(review): accepted but never forwarded to .sconline.fitLimmaFn, whose
#                 signature is not visible here -- confirm whether it should be passed on so
#                 that the correlation estimate is not recomputed in every iteration.
#   bkg_genes     forwarded as `bkg_genes` (may be NULL)
#   de_method     forwarded as `DEmethod`
#   norm_method   forwarded as `normalization`
#   rand_var      forwarded as `randomEffect`
#   covList       forwarded as `covariates`
#   quantile_norm forwarded as `quantile.norm`
#
# Returns:
#   data.frame with one row per gene: a `gene` column, the topTable columns of each contrast
#   prefixed with `early.`, `late.`, `lve.` and `ad.`, plus `blocked_analysis`, `block.cor`
#   and `cell_type`. NULL when the subset holds a single pathology group, has fewer than 6
#   columns, or the fitted design has no `pathology.group` column.
myJK_DEfn=function(inputExpData,dc,bkg_genes,de_method,norm_method,rand_var,covList,quantile_norm=T){
    res_arranged=NULL

    #requiring at least two pathology groups and at least 6 pseudocells for the DE analysis
    if(length(unique(inputExpData$pathology.group))>1&&ncol(inputExpData)>5){

        res = .sconline.fitLimmaFn(inputExpData=inputExpData,
                                covariates=covList,
                                randomEffect = rand_var,
                                bkg_genes = bkg_genes,
                                quantile.norm = quantile_norm,
                                prior.count = 1,
                                DEmethod = de_method,
                                normalization = norm_method)
        #check the dc object, the usual consensus.correlation that I get is in the range of ~0.2 or above if rand=T

        if(any(grepl("pathology.group", colnames(res$model)))){
            #Defining the comparisons that we are interested in to know their logFC and pval
            contr <- makeContrasts(
                pathology.groupearly - pathology.groupno,
                pathology.grouplate - pathology.groupno,
                pathology.grouplate - pathology.groupearly,
                (pathology.groupearly + pathology.grouplate)/2 - pathology.groupno,
                levels = res$model
                )

            fit2=contrasts.fit(res$fit, contrasts=contr)
            #Explore setting robust=F
            fit2=eBayes(fit2,robust = T,trend=T)

            #running topTable for each contrast (all genes, BH-adjusted)
            n_genes=dim(fit2)[1]
            res_early=topTable(fit2,number=n_genes, adjust.method = "BH", coef="pathology.groupearly - pathology.groupno")
            res_late=topTable(fit2,number=n_genes, adjust.method = "BH", coef="pathology.grouplate - pathology.groupno")
            res_lve=topTable(fit2,number=n_genes, adjust.method = "BH", coef="pathology.grouplate - pathology.groupearly")
            res_ad=topTable(fit2,number=n_genes, adjust.method = "BH", coef="(pathology.groupearly + pathology.grouplate)/2 - pathology.groupno")

            # topTable orders every contrast by its own p-values, so each table has to take
            # the gene names from its OWN row names (res_late previously used res_early's)
            res_early$gene=row.names(res_early)
            res_late$gene=row.names(res_late)
            res_ad$gene=row.names(res_ad)
            res_lve$gene=row.names(res_lve)

            colnames(res_early)=paste0("early.",colnames(res_early))
            colnames(res_late)=paste0("late.",colnames(res_late))
            colnames(res_ad)=paste0("ad.",colnames(res_ad))
            colnames(res_lve)=paste0("lve.",colnames(res_lve))

            # Join the four contrast tables on the gene name; `early.gene` is the surviving key
            merged_early_late <- merge(res_early, res_late, by.x="early.gene", by.y="late.gene", all=TRUE)
            merged_lve <- merge(merged_early_late, res_lve, by.x="early.gene", by.y="lve.gene", all=TRUE)
            merged_all <- merge(merged_lve, res_ad, by.x="early.gene", by.y="ad.gene", all=TRUE)

            res_arranged <- merged_all

            res_arranged$blocked_analysis=res$blocked_analysis
            res_arranged$block.cor=res$dc$consensus.correlation #check the dc object, the usual consensus.correlation range is ~0.2 or above if rand=T
            # the cell type comes from the function's own input (was read from an undefined `x`)
            res_arranged$cell_type=unique(as.character(inputExpData$cell_type))
            colnames(res_arranged)[colnames(res_arranged)=="early.gene"]="gene"
      }
    }
    return(res_arranged)
}

# Summarise leave-one-out DE results of ONE contrast into a per-gene worst-case p-value.
#
# Arguments:
#   inputList  list of data.frames, one per left-out subject, each holding a `gene` column,
#              one column whose name contains "logFC" and one whose name contains "P.Value".
#              NULL entries are ignored.
#
# Returns:
#   data.frame(gene, max_pval) restricted to genes present in every iteration. `max_pval` is
#   the largest two-sided p-value across iterations, where every iteration whose logFC sign
#   disagrees with the majority sign counts as p = 1. Zero-row data.frame for an empty list.
myJKloov_arrangeFn=function(inputList){
    # drop iterations that produced no DE table
    inputList=Filter(Negate(is.null),inputList)
    if(length(inputList)==0){
        return(data.frame(gene=character(0),max_pval=numeric(0),stringsAsFactors = F))
    }

    pval_colname=colnames(inputList[[1]])[grepl("P\\.Value",colnames(inputList[[1]]))]
    logFC_colname=colnames(inputList[[1]])[grepl("logFC",colnames(inputList[[1]]))]

    # signed z-score per gene; each iteration gets its own column name so that the merges
    # below never have to disambiguate duplicated "zscore" columns
    zscore_list=lapply(seq_along(inputList),function(i){
        tmp=inputList[[i]]
        tmp_z=data.frame(gene=tmp[,"gene"],
                         zscore=qnorm(tmp[,pval_colname]/2,lower.tail=F)*sign(tmp[,logFC_colname]),
                         stringsAsFactors = F)
        colnames(tmp_z)[2]=paste0("zscore_",i)
        return(tmp_z)
    })

    # inner join: only genes tested in all iterations are kept
    res=Reduce(function(a,b) merge(a,b,by="gene"),zscore_list)

    # drop=F keeps a matrix even when there is a single iteration
    res_max_pval=apply(as.matrix(res[,-1,drop=F]),1,function(z) {
        # the signs must be taken from the z-scores BEFORE they are turned into p-values
        z_sign=sign(z)
        sign_counts=table(z_sign)
        major_sign=as.numeric(names(sign_counts)[which.max(sign_counts)])
        pvals=pnorm(abs(z),lower.tail = F)*2
        pvals[which(z_sign!=major_sign)]=1
        return(max(pvals))
    })
    res_max_pval=data.frame(gene=res[,1],max_pval=as.numeric(res_max_pval),stringsAsFactors = F)
    return(res_max_pval)
}


# Summarise resampling DE results of ONE contrast into a per-gene sign-concordance score.
#
# Arguments:
#   inputList  non-empty list of data.frames, one per resampling iteration, each holding a
#              `gene` column and one column whose name contains "logFC".
#
# Returns:
#   data.frame with one row per gene present in every iteration: `gene`, `pattern` (the most
#   frequent logFC sign as a string: "-1", "0" or "1"), the fraction of iterations showing
#   that sign, and `jk_count` (number of iterations). NULL when no gene is shared by all
#   iterations.
#   NOTE(review): `score=y[1]` hands a one-element table to data.frame(); depending on how
#   data.frame() expands it the fraction may end up in `score.Freq` (next to `score.Var1`)
#   rather than in `score`. Left untouched because the consumers are not visible here.
myJK50pct_arrangeFn=function(inputList){
    logFC_colname=colnames(inputList[[1]])[grepl("logFC",colnames(inputList[[1]]))]

    res=inputList[[1]][,c("gene",logFC_colname)]
    # seq_along()[-1] is empty for a single iteration, whereas 2:length() would count 2,1
    for(i in seq_along(inputList)[-1]){
        res=suppressWarnings(merge(res,inputList[[i]][,c("gene",logFC_colname)],by="gene"))
    }

    # seq_len() yields no iteration at all when the inner joins left zero genes
    res_concordance_score=lapply(seq_len(nrow(res)),function(i) {
        x=as.numeric(res[i,-1])
        # share of iterations per logFC sign, most frequent first
        y=table(sign(x))/length(x)
        y=y[order(as.numeric(y),decreasing=T)]

        return(data.frame(gene=res[i,1],pattern=names(y)[1],score=y[1],jk_count=length(x),stringsAsFactors = F))
    })
    res_concordance_score=do.call("rbind",res_concordance_score)
    return(res_concordance_score)
}

#### **Leave-One-Out-Validation** 

The default DE analysis method can be time-consuming, which makes it impractical to apply iteratively. 

To remedy this, we make use of the `dc` argument in the `myJK_DEfn` function. Providing the `dc` object substantially increases the performance by avoiding the calculation of the mixed-linear model parameters. Note that in general we need to re-calculate `dc` for each cohort and cell type individually. However, we can make the approximation that `dc` does not change substantially between the full cohort and the iterations of the LOOV analysis, since we are excluding only one subject at a time and our dataset is large. We only need the value of `consensus.correlation` from the `dc` object, and we have saved these values as `block.cor` in our DE analysis step.

In [28]:
%%R -i jack_knifing -o res_w_jk -o DE_results


########### Setting the parameters ################


# Background-gene filter: when `filter_genes` is set, a gene is kept for a cell type only if
# the fraction AND the number of that cell type's cells with a non-zero count reach the
# thresholds below. With `bkg_gene_pct_thr = NULL` no background filter is applied.
if (filter_genes){
    bkg_gene_pct_thr = gene_celltype_threshold # minimum fraction of the cell type's cells expressing the gene
    bkg_gene_count_thr = 10                    # minimum number of the cell type's cells expressing the gene
} else{
    bkg_gene_pct_thr = NULL
}

# Leave-one-out jack-knife: for every strategy / DE method / normalisation / cell type, refit
# the DE model once per left-out subject, reduce the fits to a per-gene worst-case p-value per
# contrast and attach these to the full-cohort DE table.
# Result: res_w_jk[[strategy]][[de_method]][[norm_method]][[cell type]] = data.frame
if (jack_knifing){

    # Contrast prefix (as used in the DE tables) -> pathology groups taking part in it.
    # Leaving out a subject only adds an iteration to contrasts its own group is part of.
    jk_contrast_groups = list(early=c("no", "early"),
                              late=c("no", "late"),
                              ad=c("no", "late", "early"),
                              lve=c("late", "early"))

    res_w_jk <- list()

    for (strategy in pseudobulking_strategies){

        res_w_jk[[strategy]] <- list()

        for (de_method in deg_methods_to_run){

            res_w_jk[[strategy]][[de_method]] <- list()

            for (norm_method in normalization_methods){
                # name the full-cohort DE tables by the cell type they hold
                names(DE_results[[strategy]][[de_method]][[norm_method]])=unlist(lapply(DE_results[[strategy]][[de_method]][[norm_method]],
                                                                function(x) unique(x$cell_type)))

                res_w_jk[[strategy]][[de_method]][[norm_method]] <- list()

                for(iclust in names(pseudobulk_strategy_list[[strategy]])){
                    tmp_data=pseudobulk_strategy_list[[strategy]][[iclust]]
                    tmp_de_res=DE_results[[strategy]][[de_method]][[norm_method]][[iclust]]
                    if(is.null(tmp_de_res)){
                        print(paste0("No full-cohort DE results for ",iclust,"; skipping jack-knife"))
                        next
                    }

                    if(!is.null(bkg_gene_pct_thr)){
                        #background genes should be selected based on the cell level expression data
                        tmp_bkg_genes = counts(adata_annot)[,which(adata_annot$cell_type==iclust),drop=FALSE]
                        tmp_bkg_genes_counts=rowSums(tmp_bkg_genes>0)
                        tmp_bkg_genes_frac=tmp_bkg_genes_counts/sum(adata_annot$cell_type==iclust)
                        tmp_bkg_genes=row.names(adata_annot)[tmp_bkg_genes_frac>=bkg_gene_pct_thr&tmp_bkg_genes_counts>=bkg_gene_count_thr]

                    } else {
                        tmp_bkg_genes=NULL
                    }

                    # block correlation saved with the full-cohort result of THIS cell type
                    tmp_dc.object=list(consensus.correlation=unique(tmp_de_res$block.cor))

                    # one (initially empty) list of leave-one-out tables per contrast
                    res_list=lapply(jk_contrast_groups,function(x) list())

                    for(isbj in unique(tmp_data[[subject_id]])){

                        tmp_res = myJK_DEfn(inputExpData=tmp_data[, which(tmp_data[[subject_id]] != isbj)],
                                            dc=tmp_dc.object,
                                            bkg_genes=tmp_bkg_genes,
                                            de_method=de_method,
                                            norm_method=norm_method,
                                            quantile_norm=T,
                                            rand_var=subject_id,
                                            covList=covList)

                        # myJK_DEfn returns NULL when the subset cannot be analysed
                        if(is.null(tmp_res)){
                            next
                        }

                        for(ictr in names(jk_contrast_groups)){
                            if(isbj %in% unique(tmp_data[[subject_id]][tmp_data$pathology.group %in% jk_contrast_groups[[ictr]]])){
                                res_list[[ictr]] = c(res_list[[ictr]],
                                                     list(tmp_res[, c("gene", paste0(ictr,".logFC"), paste0(ictr,".P.Value"))]))
                            }
                        }
                    }

                    # a jack-knife summary needs at least two leave-one-out fits per contrast
                    if(any(sapply(res_list,length)<2)){
                        print(paste0("Fewer than 2 leave-one-out fits for at least one contrast in ",iclust,"; skipping jack-knife"))
                        next
                    }

                    # per-contrast worst-case p-values, joined on the gene name
                    res_jk=NULL
                    for(ictr in names(jk_contrast_groups)){
                        tmp_jk=myJKloov_arrangeFn(inputList=res_list[[ictr]])
                        colnames(tmp_jk)[2]=paste0("jk_",ictr,".",colnames(tmp_jk)[2])
                        if(is.null(res_jk)){
                            res_jk=tmp_jk
                        } else {
                            res_jk=merge(res_jk, tmp_jk, by="gene")
                        }
                    }

                    # attach to the full-cohort table of the SAME strategy and cell type
                    res_jk=merge(tmp_de_res, res_jk, by="gene")

                    # partial FDR: BH-adjust the jack-knife p-values among the genes that are
                    # significant (adj.P.Val < 0.1) in the full cohort; NA for all other genes
                    for(ictr in names(jk_contrast_groups)){
                        fdr_col=paste0("jk_",ictr,"_partial_fdr")
                        is_sig=which(res_jk[[paste0(ictr,".adj.P.Val")]]<0.1)
                        res_jk[[fdr_col]]=NA_real_
                        res_jk[[fdr_col]][is_sig]=p.adjust(res_jk[[paste0("jk_",ictr,".max_pval")]][is_sig],method="BH")
                    }

                    res_w_jk[[strategy]][[de_method]][[norm_method]][[iclust]]=res_jk

                    rm(res_jk,tmp_jk,res_list,tmp_bkg_genes,tmp_de_res)
                }
            }
        }
    }

} else {
    print('Number of unique subjects <= 10 or `jack_knifing` param set to `FALSE`....')
    print('Warning.....')
    print('Skipping Jack Knifing')
    res_w_jk <- NULL
}


[1] "Number of unique subjects <= 10 or `jack_knifing` param set to `FALSE`...."
[1] "Skipping Jack Knifing"


##### **Save Results**

In [29]:
# Export, for every contrast (`test_name`), one Excel workbook per DE method / pseudobulking
# strategy with one sheet per cell type, holding that contrast's statistics in a common layout.
for test_name in test_names:
    fig_dir = f'../results/{test_name}/{save_prefix}/DEG/'
    os.makedirs(fig_dir, exist_ok=True)

    # Prefix the R step put in front of this contrast's columns ('late_vs_early' is stored
    # as 'lve.'; every other contrast uses the part of the test name before '_vs_').
    if test_name == 'late_vs_early':
        contrast_prefix = 'lve.'
    else:
        contrast_prefix = f"{test_name.split('_vs_')[0]}."

    for strategy in pseudobulking_strategies:
        for de_method in deg_methods_to_run:
            for norm_method in normalization_methods:

                # NOTE(review): `norm_method` is not part of the file name, so with more than
                # one normalization method later iterations replace the sheets of earlier ones.
                file_path = fig_dir + f'Limma_{de_method.capitalize()}_{strategy}_degs.xlsx'

                for cell_type in celltypes:
                    de_table = DE_results[strategy][de_method][norm_method][cell_type]

                    # Keep this contrast's columns and cut the prefix off by position; a
                    # `str.replace` would also hit matches elsewhere in the name (as a regex,
                    # 'ad.' matches the 'adj' of 'adj.P.Val').
                    is_contrast_col = de_table.columns.str.startswith(contrast_prefix)
                    df = de_table.loc[:, is_contrast_col].copy()
                    df.columns = df.columns.str[len(contrast_prefix):]

                    df['gene'] = de_table['gene'].copy()
                    df['cell_type'] = de_table['cell_type'].copy()
                    df['blocked_analysis'] = de_table['blocked_analysis'].copy()
                    if 'block.cor' in de_table.columns:
                        df['block.cor'] = de_table['block.cor'].copy()
                    else:
                        df['block.cor'] = 'NA'
                    df['de_family'] = f'{strategy}_pseudobulk_strategy'
                    df['de_method'] = 'Limma'
                    df['de_type'] = de_method
                    df = df.rename(columns={'logFC': 'avg_logFC', 'P.Value': 'p_val', 'adj.P.Val': 'p_val_adj'})
                    df['abs_logFC'] = df['avg_logFC'].abs()
                    # anything that is not strictly positive (including 0 and NaN) is labelled "down"
                    df['direction'] = np.where(df['avg_logFC'] > 0, "up", "down")

                    df = df[['cell_type', 'gene', 'avg_logFC', 'p_val', 'p_val_adj', 'de_family',
                             'de_method', 'de_type', 'abs_logFC', 'direction', 'blocked_analysis',
                             'block.cor']]

                    # Append to an existing workbook (replacing a sheet of the same name),
                    # otherwise create it; `if_sheet_exists` is only valid in append mode.
                    workbook_exists = os.path.exists(file_path)
                    writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'} if workbook_exists else {}
                    with pd.ExcelWriter(file_path, engine='openpyxl', **writer_kwargs) as writer:
                        if workbook_exists and cell_type in writer.book.sheetnames:
                            print(f"Sheet {cell_type} already exists in {file_path}, data will be overwritten!")
                        df.to_excel(writer, sheet_name=cell_type, na_rep='NA')
