In [None]:
import os
import logging
import warnings
from gsva_prep import prep_gsva
from itertools import chain
from functions import helper_functions


## **Data Prep Parameters**

- `map_meta`: Indicates if metadata mapping is required for `pathology.group`. If set to False, `pathology.group` must exist in `adata.obs`

- `test_names`: List of the different test names of interest.

- `save_prefix`: Preferred prefix for saving critical files. Ideally chosen to be in the format `{source name}_{brain region}`. e.g `mathys_pfc`

- `subject_id`: Column name for Subject/Patient ID in both metadata and `.obs`

In [None]:
save_prefix = 'seaad_mtg'                           # this takes the format '{StudyName}_{ThreeLetterAccronymForBrainRegion}'
subject_id = 'Donor ID'  
cell_type_column = 'Subclass'                       # 'Supertype (non-expanded)', 'Subclass'
region_name = save_prefix.split('_')[-1].upper()

data_dir = f'/media/tadeoye/Volume1/data/seq/SEA-AD/{region_name}/RNAseq/'

subclass = {
    'excitatory': ['L5 IT', 'L2/3 IT', 'L4 IT', 'L6 IT', 'L6 IT Car3', 'L5/6 NP', 'L6b', 'L6 CT', 'L5 ET'],
    'inhibitory': ['Pvalb', 'Sst', 'Lamp5 Lhx6', 'Vip', 'Lamp5', 'Sncg', 'Chandelier', 'Sst Chodl', 'Pax6'],
    'astrocyte': ['Astrocyte'],
    'microglia': ['Microglia-PVM'],
    'opc': ['OPC'],
    'oligodendrocyte': ['Oligodendrocyte'],
    'endothelial': ['Endothelial'],
    'vlmc': ['VLMC'],
    }

cell_supertype = list(chain(*list(subclass.values())))

## **Load and Prep Data**

load the preprocessed and annotated data.

Make sure the anndata has the count data in either `.layers` or in `.X`. if count data is not in `.layers` it will be assumed that `.X ` contains the counts

In [None]:
# adata_annot = sc.read_h5ad(f'../data/raw/{save_prefix}/anndata/rna/{cell_group}_raw_anndata.h5ad')
adata_annot = sc.read_h5ad(data_dir + f'anndata/{save_prefix.upper()}_RNAseq_final-nuclei.2024-02-13.h5ad')
adata_annot.obs_names_make_unique()
adata_annot.var_names_make_unique()

if 'counts' not in adata_annot.layers.keys():
    print('"counts" not in layers...')
    print('analysis requires unnormalized count data...')
    print('"adata.X" will be used as count data...')

# del adata_annot.obsm, adata_annot.varm, adata_annot.uns, adata_annot.obsp, adata_annot.var, adata_annot.layers

celltypes = list(adata_annot.obs[cell_type_column].unique())

# map the pathology group to the subject id in .obs

adata_annot = adata_annot[adata_annot.obs.Subclass.isin(cell_supertype)]
adata_annot = adata_annot[adata_annot.obs['Neurotypical reference'] == 'False']
adata_annot = adata_annot[adata_annot.obs['Severely Affected Donor'] == 'N']

adata_annot

"counts" not in layers...
analysis requires unnormalized count data...
"adata.X" will be used as count data...


View of AnnData object with n_obs × n_vars = 1164766 × 36601
    obs: 'sample_id', 'Neurotypical reference', 'Donor ID', 'Organism', 'Brain Region', 'Sex', 'Gender', 'Age at Death', 'Race (choice=White)', 'Race (choice=Black/ African American)', 'Race (choice=Asian)', 'Race (choice=American Indian/ Alaska Native)', 'Race (choice=Native Hawaiian or Pacific Islander)', 'Race (choice=Unknown or unreported)', 'Race (choice=Other)', 'specify other race', 'Hispanic/Latino', 'Highest level of education', 'Years of education', 'PMI', 'Fresh Brain Weight', 'Brain pH', 'Overall AD neuropathological Change', 'Thal', 'Braak', 'CERAD score', 'Overall CAA Score', 'Highest Lewy Body Disease', 'Total Microinfarcts (not observed grossly)', 'Total microinfarcts in screening sections', 'Atherosclerosis', 'Arteriolosclerosis', 'LATE', 'Cognitive Status', 'Last CASI Score', 'Interval from last CASI in months', 'Last MMSE Score', 'Interval from last MMSE in months', 'Last MOCA Score', 'Interval from last MO

# **Parameters for Testing Differential Pathway Activity**

The primary differential pathway activity analaysis:

- `covariates`: Specifies variables to account for in the analysis:
    - Must include `Continuous Pseudo-progression Score` which will later on be matched to`pathology.group` as the primary variable of interest.

    - Can include additional confounding variables like:
        - Demographic factors (e.g., `Sex`, `Age`)
        - Technical factors (e.g., `Sample Batch`)

    - If there are no additional confounders, set `covariates = ['Continuous Pseudo-progression Score']`.

    - For the best results, ensure:
        - continuous covariates are similarly scaled.
        - uniformity in categorical data. E.g., Avoid having 'M', 'Male', and 'male' in the same dataset.
        - regularly assess the impact of covariates by running the analyses with and without each covariate. Watch out for covariates unduly influencing results. Watch out for covariates unduly influencing results.

- `random_effect`: Identifies the technical variable to be treated as random effects (in this case `subject_id` or `Sample_batch`). These effects are regressed out using `duplicatecorrection`. This would help account or technical variation not relevant to the biological question.

In [None]:
######################### Differential Pathway Activity arguments #########################
random_effect = helper_functions.clean_strings([subject_id])[0]     # Technical covariate to be included as random effect not of interest (regressed out by duplicatecorrection)
subject_id = random_effect

covariates = [
    'Continuous Pseudo-progression Score',
    'Age at Death binned codes',
    'Sex',
    'PMI',
    'Genes detected',
    'Number of UMIs',
    'method',
    'Race (choice=White)',
    'APOE4_Status'
]

covariates = helper_functions.clean_strings(covariates)

numeric_covariates = helper_functions.clean_strings(['Age at Death',
                                            'Continuous Pseudo-progression Score', 
                                            'PMI',
                                            'Genes detected',
                                            'Number of UMIs'])



# **Standardize Covariate Column Names in `adata.obs` and split/save adata at cell sublcass level**


Prior to differential pathway analysis, we implement the comprehensive covariate standardization and categorization employed **[here](https://github.com/AllenInstitute/SEA-AD_2024/blob/main/Single%20nucleus%20omics/04_Differential%20expression%20analysis/00_Split%20AnnData%20for%20nebula.ipynb)** to ensure robust statistical comparisons. 

Continuous variables were normalized to a [0,1] interval using min-max scaling, including `post-mortem interval (PMI)` and `the number of genes detected per cell`. 

`Age at death` was discretized into five equal-width bins and subsequently normalized to account for non-linear age effects.

Categorical variables underwent systematic preprocessing to ensure consistent encoding. `Sex` was binary-encoded as F/M, while `APOE4` carrier status was derived from genotype information and encoded as Y/N. Race information, specifically White identification, was encoded as Checked/Unchecked. Technical covariates, including sequencing method and donor ID, were retained with cleaned category levels. 

For quality control metrics, we normalized the `number of unique molecular identifiers (UMIs)` and `genes detected per cell`,to enabling direct comparisons across samples.


In [None]:
adata_annot.obs["continuous_pseudo_progression_score"] = adata_annot.obs["Continuous Pseudo-progression Score"].copy()


adata_annot.obs["Age at Death"] = adata_annot.obs["Age at Death"].astype("object")
adata_annot.obs["Age at Death"] = [np.float32(i) for i in adata_annot.obs["Age at Death"]]
adata_annot.obs["Age_at_Death_binned"] = pd.cut(adata_annot.obs["Age at Death"], bins=5)
adata_annot.obs["age_at_death_binned_codes"] = adata_annot.obs["Age_at_Death_binned"].cat.codes
adata_annot.obs["age_at_death_binned_codes"] = adata_annot.obs["age_at_death_binned_codes"] /  adata_annot.obs["age_at_death_binned_codes"].max()

adata_annot.obs["sex"] = adata_annot.obs["Sex"].astype("category")
adata_annot.obs["sex"] = adata_annot.obs["sex"].cat.remove_unused_categories()
adata_annot.obs["sex"] = adata_annot.obs["sex"].cat.reorder_categories(["Female", "Male"])

adata_annot.obs["race_choice_white"] = adata_annot.obs["Race (choice=White)"].astype("category")
adata_annot.obs["race_choice_white"] = adata_annot.obs["race_choice_white"].cat.remove_unused_categories()
adata_annot.obs["race_choice_white"] = adata_annot.obs["race_choice_white"].cat.reorder_categories(["Unchecked", "Checked"])

adata_annot.obs["method"] = adata_annot.obs["method"].cat.remove_unused_categories()

adata_annot.obs["genes_detected"] = (adata_annot.obs["Genes detected"] - adata_annot.obs["Genes detected"].min()) /\
                                     (adata_annot.obs["Genes detected"].max() - adata_annot.obs["Genes detected"].min())

adata_annot.obs["donor_id"] = adata_annot.obs["Donor ID"].copy()

adata_annot.obs["number_of_umis"] = adata_annot.obs["Number of UMIs"].copy()

adata_annot.obs["PMI"] = adata_annot.obs["PMI"].astype(float)
adata_annot.obs["pmi"] = (adata_annot.obs["PMI"] - adata_annot.obs["PMI"].min()) /  (adata_annot.obs["PMI"].max() - adata_annot.obs["PMI"].min())

adata_annot.obs["apoe4_status"] = adata_annot.obs["APOE Genotype"].str.contains("4")
adata_annot.obs["apoe4_status"] = adata_annot.obs["apoe4_status"].astype("category")
adata_annot.obs["apoe4_status"] = adata_annot.obs["apoe4_status"].cat.reorder_categories([False, True])
adata_annot.obs["apoe4_status"] = adata_annot.obs["apoe4_status"].cat.rename_categories(
    {
        False: "N",
        True: "Y",
    }
)

for col in adata_annot.obs.keys():
    if col not in covariates and col != cell_type_column and col!=random_effect:
            try:
                del adata_annot.obs[col]
            except KeyError:
                continue

adata_annot.write_h5ad(data_dir+f'anndata/all_subclass_standardized_anndata.h5ad', compression='gzip')

# split and save by subclass

for subclass_group in cell_supertype:
     adata_annot2 = adata_annot[adata_annot.obs[cell_type_column].isin([subclass_group])]
     subclass_group = helper_functions.clean_strings([subclass_group], preserve_case=True)[0]
     adata_annot2.write_h5ad(data_dir+f'anndata/{subclass_group}_standardized_anndata.h5ad', compression='gzip')

adata_annot

In [2]:
import os
import rpy2
import logging
import warnings
import anndata2ri
import numpy as np
import pandas as pd
import scanpy as sc
import decoupler as dc
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from itertools import chain
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams
from functions import helper_functions

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
executing                   1.2.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain                     0.8.0
markupsafe                  2.1.3
matplotlib 

In [4]:
%%R

suppressPackageStartupMessages({
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)
    library(conflicted)
    library(dplyr)
    library(parallel)
    library(stringr)

    # single-cell analysis package
    library(Seurat)
    library(zellkonverter)   
    library(SingleCellExperiment)
    library(tidyr)
    library(readxl)
    library(GSA)
    library(limma)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)
    library(ggplot2)

    # co-expression network analysis packages:
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)
    library(GSEABase)
    library(GSVA) 

    # cell-cell communication
    library(nichenetr)

# needs to be run every time you start R and want to use %>%
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
enableWGCNAThreads(nThreads = 40)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    Allowing parallel execution with up to 40 working processes.


1: replacing previous import ‘GenomicRanges::intersect’ by ‘SeuratObject::intersect’ when loading ‘hdWGCNA’ 
2: replacing previous import ‘GenomicRanges::union’ by ‘dplyr::union’ when loading ‘hdWGCNA’ 
3: replacing previous import ‘GenomicRanges::setdiff’ by ‘dplyr::setdiff’ when loading ‘hdWGCNA’ 
4: replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘hdWGCNA’ 
5: replacing previous import ‘Seurat::components’ by ‘igraph::components’ when loading ‘hdWGCNA’ 
6: replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘hdWGCNA’ 
7: replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘hdWGCNA’ 
8: replacing previous import ‘GenomicRanges::subtract’ by ‘magrittr::subtract’ when loading ‘hdWGCNA’ 
9: replacing previous import ‘Matrix::as.matrix’ by ‘proxy::as.matrix’ when loading ‘hdWGCNA’ 
10: replacing previous import ‘igraph::groups’ by ‘tidygraph::groups’ when loading ‘hdWGCNA’ 


In [None]:
%%R -o hsko

set.seed(123)   
library(scGSVA)   
hsko<-buildAnnot(species="human",keytype="SYMBOL",anntype="GO", "OP"='bp')

In [1]:
%%R -o hsko

print(hsko)

UsageError: Cell magic `%%R` not found.


In [None]:
hsko

   <NA>  <NA>  <NA>
NA.262     <NA>  <NA>  <NA>
NA.263     <NA>  <NA>  <NA>
NA.264     <NA>  <NA>  <NA>
NA.265     <NA>  <NA>  <NA>
NA.266     <NA>  <NA>  <NA>
NA.267     <NA>  <NA>  <NA>
NA.268     <NA>  <NA>  <NA>
NA.269     <NA>  <NA>  <NA>
NA.270     <NA>  <NA>  <NA>
NA.271     <NA>  <NA>  <NA>
NA.272     <NA>  <NA>  <NA>
NA.273     <NA>  <NA>  <NA>
NA.274     <NA>  <NA>  <NA>
NA.275     <NA>  <NA>  <NA>
NA.276     <NA>  <NA>  <NA>
NA.277     <NA>  <NA>  <NA>
NA.278     <NA>  <NA>  <NA>
NA.279     <NA>  <NA>  <NA>
NA.280     <NA>  <NA>  <NA>
NA.281     <NA>  <NA>  <NA>
NA.282     <NA>  <NA>  <NA>
NA.283     <NA>  <NA>  <NA>
NA.284     <NA>  <NA>  <NA>
NA.285     <NA>  <NA>  <NA>
NA.286     <NA>  <NA>  <NA>
NA.287     <NA>  <NA>  <NA>
NA.288     <NA>  <NA>  <NA>
NA.289     <NA>  <NA>  <NA>
NA.290     <NA>  <NA>  <NA>
NA.291     <NA>  <NA>  <NA>
NA.292     <NA>  <NA>  <NA>
NA.293     <NA>  <NA>  <NA>
NA.294     <NA>  <NA>  <NA>
NA.295     <NA>  <NA>  <NA>
NA.296     <NA>  <NA>  <NA>
