In [1]:
import scipy.io as sio, pandas as pd, anndata as ad, mygene, scanpy as sc, numpy as np, rapids_singlecell as rsc
import cupy as cp, gc, scipy.sparse as sp, pickle, os
from anndata import AnnData

## Assemble anndata object

In [4]:
# Input folder and filename prefix of the exported components (counts .mtx, genes/barcodes/meta_data CSVs).
in_dir = "mgh_all_obj_components"
prefix = "MGH_atac_all"
# Output location; it is concatenated directly with the filename below, so keep the trailing "/".
out_dir = "./"

In [5]:
# Read the exported components. The matrix is transposed right after reading so that
# axis 0 lines up with the barcodes / metadata rows passed to AnnData as `obs`.
X = sio.mmread(f"{in_dir}/{prefix}_counts.mtx").T.tocsr()
genes = pd.read_csv(f"{in_dir}/{prefix}_genes.csv")['Gene'].values
barcodes = pd.read_csv(f"{in_dir}/{prefix}_barcodes.csv")['Barcode'].values
metadata = pd.read_csv(f"{in_dir}/{prefix}_meta_data.csv", index_col=0)

# Ensure order matches. Explicit checks (instead of a bare assert) so a mismatch
# reports what is wrong and is not skipped when Python runs with -O.
if list(metadata.index) != list(barcodes):
    raise ValueError(
        f"Metadata index does not match barcodes "
        f"({len(metadata.index)} metadata rows vs {len(barcodes)} barcodes, or different order)"
    )
if X.shape != (len(barcodes), len(genes)):
    raise ValueError(
        f"Count matrix shape {X.shape} does not match "
        f"(n_barcodes, n_features) = ({len(barcodes)}, {len(genes)})"
    )

# Create AnnData
adata = ad.AnnData(X=X, obs=metadata)
adata.var_names = genes
adata.obs_names = barcodes

# Set raw counts (keeps a copy of the unprocessed matrix under adata.raw)
adata.raw = adata

In [6]:
adata

AnnData object with n_obs × n_vars = 235470 × 618079
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'percent.hemo', 'doublet.filter', 'Jorstard.pred', 'harmony_clusters', 'seurat_clusters', 'remove_or_not', 'unintegrated_clusters', 'AD', 'Subclass', 'Subclass_fct', 'class', 'brain.region', 'barcode', 'nCount_ATAC', 'nFeature_ATAC'

In [7]:
# metadata.head()

In [8]:
# cat_cols = adata.obs.select_dtypes(include=["category"]).columns

In [9]:
# adata.obs[cat_cols] = adata.obs[cat_cols].astype("string")  # or .astype(str)

In [10]:
# Persist the assembled object. The path is built by plain string concatenation with out_dir.
adata.write_h5ad(f"{out_dir}mgh_atac_all.h5ad")

... storing 'doublet.filter' as categorical
... storing 'Jorstard.pred' as categorical
... storing 'remove_or_not' as categorical
... storing 'AD' as categorical
... storing 'Subclass' as categorical
... storing 'Subclass_fct' as categorical
... storing 'class' as categorical
... storing 'brain.region' as categorical


## Use V3 preprocessing to generate precomputed AnnData objects for each cell type and data type

In [2]:
# Load the full RNA object that the per-cell-type preprocessing below subsets.
# NOTE(review): the assembly section above writes "mgh_atac_all.h5ad"; this RNA file is
# presumably produced by re-running that section with a different prefix — confirm.
adata = sc.read_h5ad("mgh_rna_all.h5ad")

In [3]:
adata

AnnData object with n_obs × n_vars = 235470 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'percent.hemo', 'doublet.filter', 'Jorstard.pred', 'harmony_clusters', 'seurat_clusters', 'remove_or_not', 'unintegrated_clusters', 'AD', 'Subclass', 'Subclass_fct', 'class', 'brain.region', 'barcode', 'nCount_ATAC', 'nFeature_ATAC'

In [4]:
# # start from the unique (Subclass, class) pairs
# pairs = adata.obs[["Subclass", "class"]].dropna()

# map_df = (
#     pairs.drop_duplicates()
#          .merge(
#              pairs.groupby("Subclass", observed=True).size()
#                   .rename("cells_in_subclass")
#                   .reset_index(),
#              on="Subclass", how="left"
#          )
#          .sort_values(["Subclass", "class"])
#          .reset_index(drop=True)
# )
# map_df = map_df.sort_values(by = "cells_in_subclass", ascending = False)

In [5]:
# map_df

In [6]:
# Output folder for the per-cell-type objects; filenames are appended by string
# concatenation, so the trailing "/" is required.
result_dir = "combined_analysis/adata_objects/"

In [7]:
# map_df.to_csv(os.path.join(result_dir, "subtype_mapping_counts.csv"), index = False)

In [8]:
# Subclass label (as matched against obs["Subclass"]) -> filename-safe alias used in output paths.
# Insertion order defines the processing order of both derived lists.
_SUBCLASS_ALIASES = {
    "Oligo": "oligo",
    "Astro": "astro",
    "Micro/PVM": "microglia",
    "L2/3 IT": "L23_IT",
    "L4 IT": "L4_IT",
    "L5 IT": "L5_IT",
    "L6 IT": "L6_IT",
    "Pvalb": "Pvalb",
    "Sst": "Sst",
    "Vip": "Vip",
}
cell_types = list(_SUBCLASS_ALIASES.keys())
cell_types_alias = list(_SUBCLASS_ALIASES.values())

In [9]:
def preprocess(adata: AnnData, n_pcs,*, donor_key="orig.ident", ct_flt = "nCount_RNA", min_donor_frac=0.5, min_per_donor=0.01, use_gpu = True, min_genes=1000, min_cells_frac=0.01):
    """QC-filter, donor-prevalence-filter, normalize, log-transform and PCA-embed ``adata``.

    ``adata`` is modified and the same object is returned. Steps, in order:

    1. Cast ``adata.X`` to float32 when it is not already floating point.
    2. Compute an upper count cutoff from ``adata.obs[ct_flt]``: the larger of its
       99.9th percentile and three times its median.
    3. ``rsc.pp.filter_cells`` with ``min_genes`` and then with ``max_counts=cutoff``;
       ``rsc.pp.filter_genes`` with ``min_cells = int(min_cells_frac * n_obs)``, where
       ``n_obs`` is the cell count after the cell filters.
    4. Donor-prevalence filter: for every donor, compute the fraction of that donor's
       cells with a stored entry for each feature; keep features whose fraction is
       ``>= min_per_donor`` in strictly more than ``min_donor_frac`` of the donors.
    5. ``normalize_total(target_sum=1e4)``, ``log1p``, then PCA with ``n_pcs`` components.

    Parameters
    ----------
    adata : AnnData to process.
    n_pcs : number of principal components.
    donor_key : ``obs`` column identifying the donor of each cell.
    ct_flt : ``obs`` column with per-cell counts used to derive the upper cutoff.
    min_donor_frac, min_per_donor : thresholds of the donor-prevalence filter (step 4).
    use_gpu : selects the PCA implementation only (``rsc.tl.pca`` vs ``sc.pp.pca``);
        filtering and normalization always go through ``rapids_singlecell``.
    min_genes : minimum detected features per cell (default 1000; the original note
        in this notebook mentions 1500 for microglia RNA).
    min_cells_frac : fraction of the remaining cells a feature must be detected in.

    Returns
    -------
    The processed ``adata`` with its data moved back to the CPU.
    """
    if not np.issubdtype(adata.X.dtype, np.floating):
        adata.X = adata.X.astype(np.float32)

    # Upper count cutoff, computed before any cell is removed.
    counts = adata.obs[ct_flt].astype(float)  # or "total_counts" if that's your column
    thr = max(counts.quantile(0.999), 3 * counts.median())

    # Basic QC via rapids_singlecell, then donor-prevalence filter on CPU.
    rsc.get.anndata_to_GPU(adata)
    rsc.pp.filter_cells(adata, min_genes=min_genes, verbose = False)
    rsc.pp.filter_cells(adata, max_counts = thr, verbose = False)
    rsc.pp.filter_genes(adata, min_cells = int(min_cells_frac * adata.n_obs), verbose = False)
    rsc.get.anndata_to_CPU(adata)

    donors = adata.obs[donor_key].to_numpy()
    U, inv = np.unique(donors, return_inverse=True)
    X = adata.X.tocsr() if sp.issparse(adata.X) else sp.csr_matrix(adata.X)
    X_bin = X.copy()
    X_bin.data[:] = 1  # every stored entry counts as "detected"
    per_donor = []
    for i in range(len(U)):
        in_donor = inv == i
        # np.asarray(...).ravel() flattens both np.matrix and ndarray results of sum().
        detected = np.asarray(X_bin[in_donor].sum(axis=0)).ravel()
        per_donor.append(detected / max(1, in_donor.sum()))
    pdp = np.vstack(per_donor) if per_donor else np.zeros((0, X.shape[1]))
    keep = (pdp >= min_per_donor).mean(0) > min_donor_frac
    adata._inplace_subset_var(keep)

    rsc.get.anndata_to_GPU(adata)
    rsc.pp.normalize_total(adata, target_sum=1e4)
    rsc.pp.log1p(adata)
    if use_gpu:
        rsc.tl.pca(adata, n_comps=n_pcs)
        rsc.get.anndata_to_CPU(adata)
    else:
        rsc.get.anndata_to_CPU(adata)
        sc.pp.pca(adata, n_comps=n_pcs)

    # Release cached CuPy device/pinned memory so repeated calls in a loop do not accumulate.
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    gc.collect()
    return adata

In [10]:
n_pcs = 30 # best so far = 30
mod = "rna"
flt = "nCount_RNA"
# Make sure the output folder exists before the (expensive) preprocessing starts.
os.makedirs(result_dir, exist_ok=True)
for ct, ct_alias in zip(cell_types, cell_types_alias):
    # Subset one subclass, preprocess it, and write the object plus its axis labels.
    adata_sub = adata[adata.obs["Subclass"] == ct].copy()
    adata_sub = preprocess(adata_sub, n_pcs, ct_flt = flt, use_gpu = False)
    out_stem = f"{result_dir}{ct_alias}_mgh_{mod}_v3_precomp"
    save_name = f"{out_stem}.h5ad"
    adata_sub.write_h5ad(save_name)
    print(f"Wrote: {save_name}, n_cells: {adata_sub.n_obs}, n_features: {adata_sub.n_vars}")
    pd.Series(adata_sub.obs_names, name = "barcodes").to_csv(f"{out_stem}_barcodes.csv", index = False)
    # Features come from var_names; writing obs_names here would duplicate the barcode list.
    pd.Series(adata_sub.var_names, name = "features").to_csv(f"{out_stem}_features.csv", index = False)
    del adata_sub
    gc.collect()

Wrote: combined_analysis/adata_objects/oligo_mgh_rna_v3_precomp.h5ad, n_cells: 58453, n_features: 12347
Wrote: combined_analysis/adata_objects/astro_mgh_rna_v3_precomp.h5ad, n_cells: 24610, n_features: 14287
Wrote: combined_analysis/adata_objects/microglia_mgh_rna_v3_precomp.h5ad, n_cells: 5758, n_features: 12809
Wrote: combined_analysis/adata_objects/L23_IT_mgh_rna_v3_precomp.h5ad, n_cells: 19415, n_features: 17393
Wrote: combined_analysis/adata_objects/L4_IT_mgh_rna_v3_precomp.h5ad, n_cells: 12765, n_features: 17325
Wrote: combined_analysis/adata_objects/L5_IT_mgh_rna_v3_precomp.h5ad, n_cells: 9684, n_features: 17532
Wrote: combined_analysis/adata_objects/L6_IT_mgh_rna_v3_precomp.h5ad, n_cells: 3605, n_features: 18745
Wrote: combined_analysis/adata_objects/Pvalb_mgh_rna_v3_precomp.h5ad, n_cells: 5202, n_features: 16378
Wrote: combined_analysis/adata_objects/Sst_mgh_rna_v3_precomp.h5ad, n_cells: 3875, n_features: 15052
Wrote: combined_analysis/adata_objects/Vip_mgh_rna_v3_precomp.h5ad

### Automated downsampling for all cell types

In [2]:
# adata = sc.read_h5ad("mgh_atac_all.h5ad")

In [20]:
adata

AnnData object with n_obs × n_vars = 235470 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'percent.hemo', 'doublet.filter', 'Jorstard.pred', 'harmony_clusters', 'seurat_clusters', 'remove_or_not', 'unintegrated_clusters', 'AD', 'Subclass', 'Subclass_fct', 'class', 'brain.region', 'barcode', 'nCount_ATAC', 'nFeature_ATAC'

In [None]:
pd.crosstab(adata.obs["Subclass"], adata.obs["orig.ident"])

In [14]:
# Subclass label (as matched against the subclass column in obs) -> filename-safe alias.
# Insertion order defines the order of both derived lists.
_SUBCLASS_ALIASES = {
    "Oligo": "oligo",
    "Astro": "astro",
    "Micro/PVM": "microglia",
    "L2/3 IT": "L23_IT",
    "L4 IT": "L4_IT",
    "L5 IT": "L5_IT",
    "L6 IT": "L6_IT",
    "Pvalb": "Pvalb",
    "Sst": "Sst",
    "Vip": "Vip",
}
cell_types = list(_SUBCLASS_ALIASES.keys())
cell_types_alias = list(_SUBCLASS_ALIASES.values())

In [12]:
def preprocess(adata: AnnData, n_pcs,*, donor_key="orig.ident", ct_flt = "nCount_RNA", min_donor_frac=0.5, min_per_donor=0.01, use_gpu = True, min_genes=1000, min_cells_frac=0.01):
    """QC-filter, donor-prevalence-filter, normalize, log-transform and PCA-embed ``adata``.

    ``adata`` is modified and the same object is returned. Steps, in order:

    1. Cast ``adata.X`` to float32 when it is not already floating point.
    2. Compute an upper count cutoff from ``adata.obs[ct_flt]``: the larger of its
       99.9th percentile and three times its median.
    3. ``rsc.pp.filter_cells`` with ``min_genes`` and then with ``max_counts=cutoff``;
       ``rsc.pp.filter_genes`` with ``min_cells = int(min_cells_frac * n_obs)``, where
       ``n_obs`` is the cell count after the cell filters.
    4. Donor-prevalence filter: for every donor, compute the fraction of that donor's
       cells with a stored entry for each feature; keep features whose fraction is
       ``>= min_per_donor`` in strictly more than ``min_donor_frac`` of the donors.
    5. ``normalize_total(target_sum=1e4)``, ``log1p``, then PCA with ``n_pcs`` components.

    Parameters
    ----------
    adata : AnnData to process.
    n_pcs : number of principal components.
    donor_key : ``obs`` column identifying the donor of each cell.
    ct_flt : ``obs`` column with per-cell counts used to derive the upper cutoff.
    min_donor_frac, min_per_donor : thresholds of the donor-prevalence filter (step 4).
    use_gpu : selects the PCA implementation only (``rsc.tl.pca`` vs ``sc.pp.pca``);
        filtering and normalization always go through ``rapids_singlecell``.
    min_genes : minimum detected features per cell (default 1000; the original note
        in this notebook mentions 1500 for microglia RNA).
    min_cells_frac : fraction of the remaining cells a feature must be detected in.

    Returns
    -------
    The processed ``adata`` with its data moved back to the CPU.
    """
    if not np.issubdtype(adata.X.dtype, np.floating):
        adata.X = adata.X.astype(np.float32)

    # Upper count cutoff, computed before any cell is removed.
    counts = adata.obs[ct_flt].astype(float)  # or "total_counts" if that's your column
    thr = max(counts.quantile(0.999), 3 * counts.median())

    # Basic QC via rapids_singlecell, then donor-prevalence filter on CPU.
    rsc.get.anndata_to_GPU(adata)
    rsc.pp.filter_cells(adata, min_genes=min_genes, verbose = False)
    rsc.pp.filter_cells(adata, max_counts = thr, verbose = False)
    rsc.pp.filter_genes(adata, min_cells = int(min_cells_frac * adata.n_obs), verbose = False)
    rsc.get.anndata_to_CPU(adata)

    donors = adata.obs[donor_key].to_numpy()
    U, inv = np.unique(donors, return_inverse=True)
    X = adata.X.tocsr() if sp.issparse(adata.X) else sp.csr_matrix(adata.X)
    X_bin = X.copy()
    X_bin.data[:] = 1  # every stored entry counts as "detected"
    per_donor = []
    for i in range(len(U)):
        in_donor = inv == i
        # np.asarray(...).ravel() flattens both np.matrix and ndarray results of sum().
        detected = np.asarray(X_bin[in_donor].sum(axis=0)).ravel()
        per_donor.append(detected / max(1, in_donor.sum()))
    pdp = np.vstack(per_donor) if per_donor else np.zeros((0, X.shape[1]))
    keep = (pdp >= min_per_donor).mean(0) > min_donor_frac
    adata._inplace_subset_var(keep)

    rsc.get.anndata_to_GPU(adata)
    rsc.pp.normalize_total(adata, target_sum=1e4)
    rsc.pp.log1p(adata)
    if use_gpu:
        rsc.tl.pca(adata, n_comps=n_pcs)
        rsc.get.anndata_to_CPU(adata)
    else:
        rsc.get.anndata_to_CPU(adata)
        sc.pp.pca(adata, n_comps=n_pcs)

    # Release cached CuPy device/pinned memory so repeated calls in a loop do not accumulate.
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    gc.collect()
    return adata

In [13]:
def build_downsampled_minimal(
    adata, cell_types, *, subclass_col="Subclass", donor_col="orig.ident", ct_flt = "nCount_RNA",
    n_pcs=30, use_gpu=False, seed=42
):
    """Preprocess each requested cell type, then downsample per donor to the smallest one.

    Every entry of ``cell_types`` that has cells in ``adata`` is subset (matching
    ``obs[subclass_col]`` as strings) and run through ``preprocess``. The cell type
    with the fewest cells *after* preprocessing is the "minority" and is kept whole.
    For every other cell type and every donor present in both, ``min(minority count,
    own count)`` cells are drawn without replacement.

    Parameters
    ----------
    adata : AnnData holding all cell types.
    cell_types : subclass labels to process; also the row order of the summary.
    subclass_col, donor_col : ``obs`` columns with the subclass label and donor id.
    ct_flt, n_pcs, use_gpu : forwarded to ``preprocess``.
    seed : seed for the NumPy generator used for sampling.

    Returns
    -------
    out : dict mapping cell type -> downsampled AnnData (minority first). Cell types
        with no cells in ``adata`` or with nothing sampled are omitted.
    summ : integer DataFrame, rows = ``cell_types``, columns = sorted donors present
        after downsampling, values = cells kept (rows of omitted types stay 0).

    Raises
    ------
    ValueError : if none of ``cell_types`` has any cell in ``adata``.
    """
    rng = np.random.default_rng(seed)

    # preprocess each requested subclass first (post-QC counts are the truth)
    labels = adata.obs[subclass_col].astype(str)
    pre = {}
    for ct in cell_types:
        # Subset straight from `adata`; no intermediate copy of all requested types is needed.
        sub = adata[labels.eq(str(ct))].copy()
        if sub.n_obs == 0:
            continue
        sub = preprocess(sub, n_pcs=n_pcs, ct_flt = ct_flt, use_gpu=use_gpu)
        sub.obs[donor_col] = sub.obs[donor_col].astype(str)
        pre[ct] = sub

    if not pre:
        raise ValueError(
            f"None of the requested cell types were found in obs[{subclass_col!r}]"
        )

    # choose minority AFTER preprocessing
    minority = min(pre, key=lambda k: pre[k].n_obs)
    min_counts = pre[minority].obs[donor_col].value_counts()

    # downsample others per donor to match minority per-donor counts
    out = {minority: pre[minority]}
    for ct, sub in pre.items():
        if ct == minority:
            continue
        vc = sub.obs[donor_col].value_counts()
        keep = []
        # Sorted iteration: plain set order of strings varies between interpreter
        # sessions, which would make the seeded draws differ from run to run.
        for d in sorted(set(min_counts.index) & set(vc.index)):
            k = int(min(min_counts[d], vc[d]))
            if k:
                pool = sub.obs.index[sub.obs[donor_col] == d].to_numpy()
                keep.extend(rng.choice(pool, size=k, replace=False))
        if keep:
            out[ct] = sub[keep].copy()

    # summary: rows = requested cell types (in given order), cols = donors present after DS
    donors = sorted({d for a in out.values() for d in a.obs[donor_col]})
    summ = pd.DataFrame(0, index=cell_types, columns=donors, dtype=int)
    for ct, a in out.items():
        vc = a.obs[donor_col].value_counts()
        summ.loc[ct, vc.index] = vc.values

    return out, summ

In [14]:
# obs column that preprocess() uses to derive its upper count cutoff.
flt = "nCount_RNA"
# Preprocess every requested subclass and downsample per donor.
# ds_dict maps cell type -> AnnData; summary_df is the cell-type x donor count table.
ds_dict, summary_df = build_downsampled_minimal(
    adata, cell_types,
    subclass_col="Subclass", donor_col="orig.ident", ct_flt = flt, n_pcs=30, use_gpu=False
)

In [15]:
# Row total over the donor columns. An existing "total_cells" column is excluded first,
# so re-running this cell does not fold the previous total into the new one.
summary_df["total_cells"] = summary_df.drop(columns="total_cells", errors="ignore").sum(axis = 1)

In [16]:
summary_df

Unnamed: 0,2012,2021,2026,2027,2037,2057,2058,2068,2097,2112,2178,2191,2210,2232,2259,2274,2327,2339,2380,total_cells
Oligo,206,195,110,40,49,233,159,181,173,47,10,444,309,132,113,193,171,413,427,3605
Astro,206,195,110,40,49,233,159,181,173,47,10,444,309,132,95,193,171,413,427,3587
Micro/PVM,41,195,110,40,49,233,159,181,173,47,10,244,309,132,43,193,166,379,353,3057
L2/3 IT,206,195,33,40,49,233,159,181,173,47,10,444,309,132,113,193,171,413,427,3528
L4 IT,206,195,110,40,49,233,159,181,173,47,10,444,309,132,113,193,171,413,427,3605
L5 IT,206,195,110,31,49,233,159,181,173,47,10,444,309,132,113,193,171,413,427,3596
L6 IT,206,195,110,40,49,233,159,181,173,47,10,444,309,132,113,193,171,413,427,3605
Pvalb,184,195,76,37,49,233,159,181,173,47,10,354,309,132,75,193,171,413,427,3418
Sst,144,123,73,9,49,233,159,181,173,47,4,267,309,132,31,48,171,362,268,2783
Vip,132,160,29,9,49,233,159,181,173,47,10,283,223,132,53,74,171,356,427,2901


In [17]:
# Output folder for the downsampled objects; filenames are appended by string
# concatenation, so the trailing "/" is required.
adata_dir = "combined_analysis_downsampled/adata_objects/"

In [18]:
mod = "rna"
# Make sure the output folder exists before writing.
os.makedirs(adata_dir, exist_ok=True)
for ct, ct_alias in zip(cell_types, cell_types_alias):
    # build_downsampled_minimal omits cell types with no cells / nothing sampled.
    if ct not in ds_dict:
        print(f"Skipped: {ct} (not present in ds_dict)")
        continue
    adata_sub = ds_dict[ct]
    out_stem = f"{adata_dir}{ct_alias}_mgh_{mod}_v3_precomp"
    save_name = f"{out_stem}.h5ad"
    adata_sub.write_h5ad(save_name)
    print(f"Wrote: {save_name}, n_cells: {adata_sub.n_obs}, n_features: {adata_sub.n_vars}")
    pd.Series(adata_sub.obs_names, name = "barcodes").to_csv(f"{out_stem}_barcodes.csv", index = False)
    # Features come from var_names; writing obs_names here would duplicate the barcode list.
    pd.Series(adata_sub.var_names, name = "features").to_csv(f"{out_stem}_features.csv", index = False)

... storing 'orig.ident' as categorical
... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/oligo_mgh_rna_v3_precomp.h5ad, n_cells: 3605, n_features: 12347


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/astro_mgh_rna_v3_precomp.h5ad, n_cells: 3587, n_features: 14287


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/microglia_mgh_rna_v3_precomp.h5ad, n_cells: 3057, n_features: 12809


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/L23_IT_mgh_rna_v3_precomp.h5ad, n_cells: 3528, n_features: 17393


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/L4_IT_mgh_rna_v3_precomp.h5ad, n_cells: 3605, n_features: 17325


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/L5_IT_mgh_rna_v3_precomp.h5ad, n_cells: 3596, n_features: 17532


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/L6_IT_mgh_rna_v3_precomp.h5ad, n_cells: 3605, n_features: 18745


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/Pvalb_mgh_rna_v3_precomp.h5ad, n_cells: 3418, n_features: 16378


... storing 'orig.ident' as categorical


Wrote: combined_analysis_downsampled/adata_objects/Sst_mgh_rna_v3_precomp.h5ad, n_cells: 2783, n_features: 15052
Wrote: combined_analysis_downsampled/adata_objects/Vip_mgh_rna_v3_precomp.h5ad, n_cells: 2901, n_features: 15577


In [19]:
# Save the cell-type x donor count table next to the downsampled objects (index column = cell type).
summary_df.to_csv(f"{adata_dir}{mod}_cell_counts.csv", index = True)

## Public dataset preprocessing

In [2]:
# Load the public dataset object (the path suggests microglia RNA — confirm).
# This rebinds `adata`, replacing the object used in the sections above.
adata = sc.read_h5ad("microglia/rna_analysis/result/mg_public.h5ad")

In [3]:
adata

AnnData object with n_obs × n_vars = 174420 × 16228
    obs: 'Unnamed: 0', 'subject', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'brainRegion', 'batch', 'barcode', 'percent.mt', 'age_death', 'msex', 'pmi', 'ADdiag3types', 'percent.rp', 'seurat_clusters'

In [4]:
np.unique(adata.obs["ADdiag3types"], return_counts = True)

(array(['earlyAD', 'lateAD', 'nonAD'], dtype=object),
 array([49743, 29972, 94705]))

In [5]:
# Keep every cell whose ADdiag3types label is not 'earlyAD' (copy, so `adata` stays untouched).
adata_sub = adata[adata.obs["ADdiag3types"] != 'earlyAD'].copy()

In [6]:
adata_sub

AnnData object with n_obs × n_vars = 124677 × 16228
    obs: 'Unnamed: 0', 'subject', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'brainRegion', 'batch', 'barcode', 'percent.mt', 'age_death', 'msex', 'pmi', 'ADdiag3types', 'percent.rp', 'seurat_clusters'

In [7]:
np.unique(adata_sub.obs["ADdiag3types"], return_counts = True)

(array(['lateAD', 'nonAD'], dtype=object), array([29972, 94705]))

In [9]:
np.unique(adata_sub.obs["brainRegion"], return_counts = True)

(array(['AngularGyrus', 'EntorhinalCortex', 'Hippocampus',
        'MidtemporalCortex', 'PFC', 'Thalamus'], dtype=object),
 array([ 6033, 17346, 22246,  7941, 53697, 17414]))

In [None]:
np.unique(adata_sub.obs["subject"].dropna().astype(str), return_counts = True)

In [14]:
# Keep only cells from subjects that contribute at least 100 cells; cells without a subject are dropped.
subject = adata_sub.obs["subject"]
has_subject = subject.notna()
vals, cnts = np.unique(subject[has_subject].astype(str), return_counts=True)
keep = {name for name, n_cells in zip(vals, cnts) if n_cells >= 100}
adata_sub = adata_sub[has_subject & subject.astype(str).isin(keep)].copy()

In [None]:
np.unique(adata_sub.obs["subject"].astype(str), return_counts = True)

In [19]:
adata_sub

AnnData object with n_obs × n_vars = 119778 × 16228
    obs: 'Unnamed: 0', 'subject', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'brainRegion', 'batch', 'barcode', 'percent.mt', 'age_death', 'msex', 'pmi', 'ADdiag3types', 'percent.rp', 'seurat_clusters'

In [20]:
np.unique(adata_sub.obs['age_death'], return_counts = True)

(array([ 70,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,
         97,  98,  99, 100, 103, 104]),
 array([  106,   879,   817,   982,  2033,  4194,  7285,  2951,  3543,
          108,  4006,  2670,  4537,  8994,  3468,  6787, 11223, 10199,
         6942,  2854,  3616, 11827,  4892,  6799,  4274,   637,  1116,
          751,   560,   231,   301,   196]))

In [23]:
np.unique(adata_sub.obs["ADdiag3types"], return_counts = True)

(array(['lateAD', 'nonAD'], dtype=object), array([28286, 91492]))

In [24]:
def preprocess(adata: AnnData, n_pcs,*, donor_key="orig.ident", min_donor_frac=0.5, min_per_donor=0.01, use_gpu = True, ct_flt="nCount_RNA", min_genes=1000, min_cells_frac=0.01):
    """QC-filter, donor-prevalence-filter, normalize, log-transform and PCA-embed ``adata``.

    ``adata`` is modified and the same object is returned. Steps, in order:

    1. Cast ``adata.X`` to float32 when it is not already floating point.
    2. Compute an upper count cutoff from ``adata.obs[ct_flt]``: the larger of its
       99.9th percentile and three times its median.
    3. ``rsc.pp.filter_cells`` with ``min_genes`` and then with ``max_counts=cutoff``;
       ``rsc.pp.filter_genes`` with ``min_cells = int(min_cells_frac * n_obs)``, where
       ``n_obs`` is the cell count after the cell filters. The filter functions are
       called with their default verbosity.
    4. Donor-prevalence filter: for every donor, compute the fraction of that donor's
       cells with a stored entry for each feature; keep features whose fraction is
       ``>= min_per_donor`` in strictly more than ``min_donor_frac`` of the donors.
    5. ``normalize_total(target_sum=1e4)``, ``log1p``, then PCA with ``n_pcs`` components.

    Parameters
    ----------
    adata : AnnData to process.
    n_pcs : number of principal components.
    donor_key : ``obs`` column identifying the donor of each cell.
    min_donor_frac, min_per_donor : thresholds of the donor-prevalence filter (step 4).
    use_gpu : selects the PCA implementation only (``rsc.tl.pca`` vs ``sc.pp.pca``);
        filtering and normalization always go through ``rapids_singlecell``.
    ct_flt : ``obs`` column with per-cell counts used to derive the upper cutoff.
    min_genes : minimum detected features per cell (default 1000; the original note
        in this notebook mentions 1500 for microglia RNA).
    min_cells_frac : fraction of the remaining cells a feature must be detected in.

    Returns
    -------
    The processed ``adata`` with its data moved back to the CPU.
    """
    if not np.issubdtype(adata.X.dtype, np.floating):
        adata.X = adata.X.astype(np.float32)

    # Upper count cutoff, computed before any cell is removed.
    counts = adata.obs[ct_flt].astype(float)  # or "total_counts" if that's your column
    thr = max(counts.quantile(0.999), 3 * counts.median())

    # Basic QC via rapids_singlecell, then donor-prevalence filter on CPU.
    rsc.get.anndata_to_GPU(adata)
    rsc.pp.filter_cells(adata, min_genes=min_genes)
    rsc.pp.filter_cells(adata, max_counts = thr)
    rsc.pp.filter_genes(adata, min_cells = int(min_cells_frac * adata.n_obs))
    rsc.get.anndata_to_CPU(adata)

    donors = adata.obs[donor_key].to_numpy()
    U, inv = np.unique(donors, return_inverse=True)
    X = adata.X.tocsr() if sp.issparse(adata.X) else sp.csr_matrix(adata.X)
    X_bin = X.copy()
    X_bin.data[:] = 1  # every stored entry counts as "detected"
    per_donor = []
    for i in range(len(U)):
        in_donor = inv == i
        # np.asarray(...).ravel() flattens both np.matrix and ndarray results of sum().
        detected = np.asarray(X_bin[in_donor].sum(axis=0)).ravel()
        per_donor.append(detected / max(1, in_donor.sum()))
    pdp = np.vstack(per_donor) if per_donor else np.zeros((0, X.shape[1]))
    keep = (pdp >= min_per_donor).mean(0) > min_donor_frac
    adata._inplace_subset_var(keep)

    rsc.get.anndata_to_GPU(adata)
    rsc.pp.normalize_total(adata, target_sum=1e4)
    rsc.pp.log1p(adata)
    if use_gpu:
        rsc.tl.pca(adata, n_comps=n_pcs)
        rsc.get.anndata_to_CPU(adata)
    else:
        rsc.get.anndata_to_CPU(adata)
        sc.pp.pca(adata, n_comps=n_pcs)

    # Release cached CuPy device/pinned memory so repeated calls do not accumulate.
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    gc.collect()
    return adata

In [25]:
n_pcs = 30 # best so far = 30
# Preprocess the filtered public dataset using "subject" as the donor column and scanpy's PCA.
# Note: adata_sub is rebound to the processed object, so this cell is not meant to be re-run on its own output.
adata_sub = preprocess(adata_sub, n_pcs, donor_key = "subject", use_gpu = False)

filtered out 21429 cells that have less than 1000 genes expressed
filtered out 126 cells that have more than 11817.41399015938 counts
filtered out 4659 genes that are detected in less than 982 cells


In [26]:
adata_sub

AnnData object with n_obs × n_vars = 98223 × 11209
    obs: 'Unnamed: 0', 'subject', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'brainRegion', 'batch', 'barcode', 'percent.mt', 'age_death', 'msex', 'pmi', 'ADdiag3types', 'percent.rp', 'seurat_clusters', 'n_counts', 'n_genes'
    var: 'n_counts', 'n_cells'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [28]:
np.unique(adata_sub.obs["subject"].astype(str)).shape

(213,)

In [29]:
# Persist the preprocessed public dataset (the target folder must already exist).
adata_sub.write_h5ad("combined_analysis/adata_objects/public_dataset/microglia_rosmap_AD_binary_donor_flt_v3_precomp.h5ad")