In [1]:
import os
import rpy2
import logging
import warnings
import anndata2ri
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams

In [2]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

# rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
# rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
appnope                     0.1.3
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
executing                   1.2.0
google                      NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain                     0.8.0
markupsafe                  2.1.3
matplotlib                  3.7.1
mpl_toolkits                NA
natsort 

# **Load input data in raw form and save in `.h5ad` format**

### **List of all datasets present in Gazestani meta-analysis**

In [3]:
%%R

library(qs)
library(dplyr)
library(SingleCellExperiment)
library(readxl)
library(scMerge)
library(Seurat)

mic_annot <- qread("../data/raw/gazestani_pfc/annotations/MG_Final_anno.qs")

unique(mic_annot$ds_batch)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
     [1] "human_Lake_29227469_VisualCortex" "human_Li_nuclei_30545854"        
 [3] "human_Hodge_31435019"             "human_Bakken_34616062"           
 [5] "mouse_Chen_28355573"              "mouse_Keren-Shaul_28602351"      
 [7] "mouse_SalaFrigerio_exp1_31018141" "mouse_SalaFrigerio_exp2_31018141"
 [9] "mouse_Masuda_GSE120745"           "mouse_Masuda_GSE120629"          
[11] "mouse_Masuda_GSE120744"           "mouse_Sierksma_31951107"         
[13] "mouse_Yao_34616066_BICCN_sn"      "mouse_Zhou_31932797"             
[15] "mouse_dulken_31270459"            "mouse_Zywitza_30485812"          
[17] "mouse_Hammond_30471926"           "mouse_Hammond_30471926_LPC"      
[19] "mouse_Bhattacherjee_31519873"     "human_Zhou_31932797"             
[21] "human_Leng_33432193"              "human_Gerrits_3

qs 0.25.5

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count


Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeig

In [4]:
def convert_columns_to_string(sce, obs_cols=None, var_cols=None):
    """
    Convert specified columns in the .obs and .var DataFrames of an anndata object to string type.

    Parameters:
    sce (anndata.AnnData): The single-cell AnnData object to modify.
    obs_cols (list of str): Columns in sce.obs to convert to strings.
    var_cols (list of str): Columns in sce.var to convert to strings.
    """
    if obs_cols is not None:
        for col in obs_cols:
            sce.obs[col] = sce.obs[col].astype(str)
    
    if var_cols is not None:
        for col in var_cols:
            sce.var[col] = sce.var[col].astype(str)

def save_anndata(sce, file_path):
    """
    Save an AnnData object to a file.

    Parameters:
    sce (anndata.AnnData): The single-cell AnnData object to save.
    file_path (str): The path to save the file to.
    """
    sce.write_h5ad(file_path, compression='gzip')



def create_minimal_adata(adata_obj, output_file_path, gene_id_column=None):
    """
    Load an .h5ad file, create a minimal AnnData object from it, and save the minimal object.

    Parameters:
    input_file_path (str): Path to the original .h5ad file.
    output_file_path (str): Path where the minimal .h5ad file will be saved.
    gene_id_column (str, optional): Column name in adata.var where gene identifiers are stored. 
                                    If None, it is assumed gene identifiers are stored in adata.var.index.
    """
    try:
        # Load the original .h5ad file
        my_too_large_adata = adata_obj
        
        # Create a minimal AnnData object
        minimal_adata = ad.AnnData(my_too_large_adata.X)
        
        # Set gene identifiers
        if gene_id_column:
            # Option (b): If gene identifiers are stored in a specific column
            minimal_adata.var_names = my_too_large_adata.var[gene_id_column]
        else:
            # Option (a): If gene identifiers are stored in adata.var.index
            minimal_adata.var_names = my_too_large_adata.var_names
        
        # Set observation names
        minimal_adata.obs_names = my_too_large_adata.obs_names
        
        # Save the minimal AnnData object with gzip compression
        minimal_adata.write(output_file_path, compression='gzip')
        
        print(f"Minimal AnnData object has been saved to {output_file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Function to filter and update AnnData object based on cell barcode annotations
def filter_and_update_anndata(ad, annot):

    # Filter valid cell barcodes
    #valid_barcodes = ad.obs_names.intersection()
    ad_filtered = ad[annot['cell_barcode'].to_list()].copy()
    
    # Merge annotations
    ad_filtered.obs = ad_filtered.obs.merge(annot, left_on=ad_filtered.obs_names, right_on='cell_barcode', how='right')
    
    # Update obs_names with cell barcodes
    ad_filtered.obs_names = ad_filtered.obs['cell_barcode']
    
    return ad_filtered

## **[Gazestani et. al. 2019](https://doi.org/10.1016/j.cell.2023.08.005) (Prefrontal Cortex)**

In [5]:
%%R -o sce_list -o annot_list



# Function to read data
read_data <- function(path) {
  qread(path)
}

# Function to filter annotations based on ds_batch
filter_annotations <- function(annot, batch = 'human_NPH') {
  annot %>% filter(ds_batch == batch)
}

# Function to transform sample column
transform_sample <- function(sample_column, start_ind=3) {
  sapply(sample_column, function(x) {
    paste(unlist(strsplit(x, "_"))[start_ind:length(unlist(strsplit(x, "_")))], collapse = "_")
  })
}

# Function to subset SCE objects based on valid columns
subset_sce <- function(sce, annot) {
  valid_cols <- colnames(sce)[!is.na(match(colnames(sce), annot$cell_barcode))]
  sce[, valid_cols]
}

# Function to merge annotations with SCE object
merge_annotations <- function(sce, annot) {
  colData(sce)$cell_barcode <- colnames(sce)
  colData(sce) <- DataFrame(as(colData(sce), "data.frame") %>% 
                            right_join(annot, by = "cell_barcode"))
  colnames(sce) <- colData(sce)$cell_barcode
  return(sce)
}

# Paths
base_path <- "../data/raw/gazestani_pfc/organized_data/Human/brain/snRNA/Gazestani_0001"
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("Astro", "Endo", "ExN", "InN", "MG", "Oligo", "OPC")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
sce_list <- lapply(paste0(base_path, "/", cell_types, "_data_arranged_updatedId_final_batches.qs"), read_data)
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, filter_annotations)

# Assign names to lists
names(sce_list) <- cell_types
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample)
  sce_list[[cell_type]] <- subset_sce(sce_list[[cell_type]], annot_list[[cell_type]])
  sce_list[[cell_type]] <- merge_annotations(sce_list[[cell_type]], annot_list[[cell_type]])
}


In [6]:
dat_dir = f'../data/raw/gazestani_pfc/anndata/minimal/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)


# Define the columns to convert for obs and var
obs_columns_to_convert = ['anno_braak_score']
var_columns_to_convert = ['entrezid']

# Define the SCE objects and their corresponding file paths
sce_objects = sce_list

file_paths = {
    'ExN': "../data/raw/gazestani_pfc/anndata/excitatory_raw_anndata.h5ad",
    'InN': "../data/raw/gazestani_pfc/anndata/inhibitory_raw_anndata.h5ad",
    'Astro': "../data/raw/gazestani_pfc/anndata/astrocyte_raw_anndata.h5ad",
    'MG': "../data/raw/gazestani_pfc/anndata/microglia_raw_anndata.h5ad",
    'Oligo': "../data/raw/gazestani_pfc/anndata/oligodendrocyte_raw_anndata.h5ad",
    'OPC': "../data/raw/gazestani_pfc/anndata/opc_raw_anndata.h5ad",
    'Endo': "../data/raw/gazestani_pfc/anndata/endothelial_raw_anndata.h5ad"
}

# Loop through SCE objects to convert column types and save them
for sce_name, sce_obj in sce_objects.items():
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])
    # create_minimal_adata(sce_obj, '/'.join(file_paths[sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[sce_name].split('/')[-1]]))


In [8]:
metadata = pd.read_excel('../data/raw/gazestani_pfc/full_meta.xlsx')
metadata['pathology.group'] = metadata.Status.map({'Abeta': "early", 'AbetaTau': 'late', 'Ctrl': 'no'})
metadata['individualID'] = metadata['Sbj code'].astype(str)

metadata.to_csv('../data/raw/gazestani_pfc/gazestani_pfc_metadata.csv')

## **[Gerrits et. al. 2021](https://link.springer.com/article/10.1007/s00401-021-02263-w) (Occipital Cortex)**

In [9]:
%%R -o sce_list -o annot_list

# Function to transform sample column
transform_sample <- function(sample_column, start_ind=3) {
  sapply(sample_column, function(x) {
    paste(unlist(strsplit(x, "_"))[start_ind:length(unlist(strsplit(x, "_")))], collapse = "_")
  })
}

# Paths
base_path <- "../data/raw/gazestani_pfc/organized_data/Human/brain/snRNA/Gerrits_33609158"
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("MG")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
sce_list <- lapply(paste0(base_path, "/", cell_types, "_data_arranged_updatedId_final_batches.qs"), read_data)
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Gerrits_33609158'))


# Assign names to lists
names(sce_list) <- cell_types
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
  sce_list[[cell_type]] <- subset_sce(sce_list[[cell_type]], annot_list[[cell_type]])
  sce_list[[cell_type]] <- merge_annotations(sce_list[[cell_type]], annot_list[[cell_type]])
}



In [10]:
dat_dir = f'../data/raw/gerrits_otc/anndata/minimal/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = ['anno_braak_score', 'anno_orig_cellState']
var_columns_to_convert = ['entrezid']

# Define the SCE objects and their corresponding file paths
sce_objects = sce_list

file_paths = {
    'MG': "../data/raw/gerrits_otc/anndata/microglia_raw_anndata.h5ad",
}

# Loop through SCE objects to convert column types and save them
for sce_name, sce_obj in sce_objects.items():
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])
    create_minimal_adata(sce_obj, '/'.join(file_paths[sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[sce_name].split('/')[-1]]))

Minimal AnnData object has been saved to ../data/raw/gerrits_otc/anndata/minimal_microglia_raw_anndata.h5ad


In [11]:
metadata = sce_list['MG'].obs.drop_duplicates(subset='title', keep='first')
metadata['pathology.group'] = metadata['sample.group.ch1'].map({'CTR+': "early", 'AD': 'late', 'CTR': 'no'})
metadata['individualID'] = metadata.title.astype(str)

metadata.to_csv('../data/raw/gerrits_otc/gerrits_otc_metadata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['pathology_group'] = metadata['sample.group.ch1'].map({'CTR+': "early", 'AD': 'late', 'CTR': 'no'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['individualID'] = metadata.title.astype(str)


## **[Mathys et. al. 2019](https://www.nature.com/articles/s41586-019-1195-2) (Prefrontal Cortex)**

In [12]:
%%R -o annot_list

# Paths
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("Astro", "Endo", "ExN", "InN", "MG", "Oligo", "OPC")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Mathys_31042697'))

# Assign names to lists
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
}

In [13]:
adata = sc.read_mtx('../data/raw/mathys_pfc/notfiltered_count_matrix.mtx').T

obs_names = pd.read_csv('../data/raw/mathys_pfc/notfiltered_column_metadata.txt', sep='\t')
obs_names.TAG = obs_names.TAG.str.replace('.', '-')
obs_names.set_index('TAG', inplace=True)
obs_names.index.rename('index', inplace=True)

gene_names = pd.read_csv('../data/raw/mathys_pfc/notfiltered_gene_row_names.txt', sep='\t', header=None)

adata.obs = obs_names
adata.var_names = gene_names[1]


sce_list = {}
for cell_type, annot in annot_list.items():
    if len(annot_list[cell_type])>0:
        sce_list[cell_type] = filter_and_update_anndata(adata, annot_list[cell_type])


In [14]:
dat_dir = f'../data/raw/mathys_pfc/anndata/minimal/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = ['anno_braak_score']
var_columns_to_convert = None

# Define the SCE objects and their corresponding file paths
sce_objects = sce_list

file_paths = {
    'ExN': "../data/raw/mathys_pfc/anndata/excitatory_raw_anndata.h5ad",
    'InN': "../data/raw/mathys_pfc/anndata/inhibitory_raw_anndata.h5ad",
    'Astro': "../data/raw/mathys_pfc/anndata/astrocyte_raw_anndata.h5ad",
    'MG': "../data/raw/mathys_pfc/anndata/microglia_raw_anndata.h5ad",
    'Oligo': "../data/raw/mathys_pfc/anndata/oligodendrocyte_raw_anndata.h5ad",
    'OPC': "../data/raw/mathys_pfc/anndata/opc_raw_anndata.h5ad",
    'Endo': "../data/raw/mathys_pfc/anndata/endothelial_raw_anndata.h5ad"
}

# Loop through SCE objects to convert column types and save them
for sce_name, sce_obj in sce_objects.items():
    sce_obj.var_names_make_unique()
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])   
    create_minimal_adata(sce_obj, '/'.join(file_paths[sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[sce_name].split('/')[-1]]))

Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_astrocyte_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_excitatory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_inhibitory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_microglia_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_oligodendrocyte_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/mathys_pfc/anndata/minimal_opc_raw_anndata.h5ad


In [15]:
metadata = sce_list['Astro'].obs.drop_duplicates(subset='projid', keep='first')
metadata['pathology.group'] = metadata['anno_braak_score'].map({'3': "early", '4': 'early', '5': 'late', '6': 'late', '0': 'no', '1': 'no', '2': 'no'})
metadata['individualID'] = metadata.projid.astype(str)

metadata.to_csv('../data/raw/mathys_pfc/mathys_pfc_metadata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['pathology_group'] = metadata['anno_braak_score'].map({'3': "early", '4': 'early', '5': 'late', '6': 'late', '0': 'no', '1': 'no', '2': 'no'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['individualID'] = metadata.projid.astype(str)


## **[Leng et. al. 2021](https://www.nature.com/articles/s41593-020-00764-7) (Superior Frontal Gyrus & Entorhinal Cortex)**

In [16]:
%%R -o annot_list

# Paths
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("Astro", "Endo", "ExN", "InN", "MG", "Oligo", "OPC")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Leng_33432193'))

# Assign names to lists
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
}

In [18]:
# processed data obtained 
adata_leng = {}

readRDS = robjects.r['readRDS']
df_etc = readRDS('../data/raw/leng_etc/sce.EC.scAlign.assigned.rds')
adata_leng['etc'] = df_etc

# processed data obtained 
readRDS = robjects.r['readRDS']
df_sfg = readRDS('../data/raw/leng_sfg/sce.SFG.scAlign.assigned.rds')
adata_leng['sfg'] = df_sfg

cell_types = ['Exc', 'Inh', 'Astro', 'Endo', 'Micro', 'OPC', 'Oligo']

In [21]:
dat_dir = {'etc': f'../data/raw/leng_etc/anndata/minimal/', 'sfg': f'../data/raw/leng_sfg/anndata/minimal/'}


# Define the columns to convert for obs and var
obs_columns_to_convert = None
var_columns_to_convert = None

file_paths = {
    'etc': {
        'Exc': "../data/raw/leng_etc/anndata/excitatory_raw_anndata.h5ad",
        'Inh': "../data/raw/leng_etc/anndata/inhibitory_raw_anndata.h5ad",
        'Astro': "../data/raw/leng_etc/anndata/astrocyte_raw_anndata.h5ad",
        'Micro': "../data/raw/leng_etc/anndata/microglia_raw_anndata.h5ad",
        'Oligo': "../data/raw/leng_etc/anndata/oligodendrocyte_raw_anndata.h5ad",
        'OPC': "../data/raw/leng_etc/anndata/opc_raw_anndata.h5ad",
        'Endo': "../data/raw/leng_etc/anndata/endothelial_raw_anndata.h5ad"
        },

    'sfg':
        {   
        'Exc': "../data/raw/leng_sfg/anndata/excitatory_raw_anndata.h5ad",
        'Inh': "../data/raw/leng_sfg/anndata/inhibitory_raw_anndata.h5ad",
        'Astro': "../data/raw/leng_sfg/anndata/astrocyte_raw_anndata.h5ad",
        'Micro': "../data/raw/leng_sfg/anndata/microglia_raw_anndata.h5ad",
        'Oligo': "../data/raw/leng_sfg/anndata/oligodendrocyte_raw_anndata.h5ad",
        'OPC': "../data/raw/leng_sfg/anndata/opc_raw_anndata.h5ad",
        'Endo': "../data/raw/leng_sfg/anndata/endothelial_raw_anndata.h5ad"
        }
    }

for brain_region in ['etc', 'sfg']:

    if not os.path.exists(dat_dir[brain_region]):
        os.makedirs(dat_dir[brain_region])

    for sce_name in cell_types:
        sce_obj = adata_leng[brain_region][adata_leng[brain_region].obs.clusterCellType==sce_name]
        convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
        save_anndata(sce_obj, file_paths[brain_region][sce_name])   
        create_minimal_adata(sce_obj, '/'.join(file_paths[brain_region][sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[brain_region][sce_name].split('/')[-1]]))

Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_excitatory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_inhibitory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_astrocyte_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_endothelial_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_microglia_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_opc_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_etc/anndata/minimal_oligodendrocyte_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_sfg/anndata/minimal_excitatory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/raw/leng_sfg/anndata/minimal_inhibitory_raw_anndata.h5ad
Minimal AnnData object has been saved to ../data/

## **[Gabitto et. al. 2021](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10246227/) (Middle Temporal Gyrus)**

In [5]:
adata = sc.read_h5ad('../data/raw/SEA-AD/MTG/RNASeq/SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad')

adata.obs['cell_type'] = adata.obs['Subclass'].copy()
adata.obs['cell_type'] = adata.obs['cell_type'].astype(str)
adata.obs.loc[adata.obs['Class'].str.startswith("Neuronal:"), 'cell_type'] = adata.obs['Class'][adata.obs['Class'].str.startswith("Neuronal:")].map({"Neuronal: Glutamatergic": "Excitatory",
                                                                "Neuronal: GABAergic": "Inhibitory"})

adata.obs.loc[adata.obs.cell_type=='Microglia-PVM', 'cell_type'] = 'Microglia'
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

In [6]:
dat_dir = f'../data/raw/seaad_mtg/anndata/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = None
var_columns_to_convert = None

# Define the SCE objects and their corresponding file paths

file_paths = {
    'Excitatory': "../data/raw/seaad_mtg/anndata/excitatory_raw_anndata.h5ad",
    'Inhibitory': "../data/raw/seaad_mtg/anndata/inhibitory_raw_anndata.h5ad",
    'Astrocyte': "../data/raw/seaad_mtg/anndata/astrocyte_raw_anndata.h5ad",
    'Microglia': "../data/raw/seaad_mtg/anndata/microglia_raw_anndata.h5ad",
    'Oligodendrocyte': "../data/raw/seaad_mtg/anndata/oligodendrocyte_raw_anndata.h5ad",
    'OPC': "../data/raw/seaad_mtg/anndata/opc_raw_anndata.h5ad",
    'Endothelial': "../data/raw/seaad_mtg/anndata/endothelial_raw_anndata.h5ad",
    'VLMC': "../data/raw/seaad_mtg/anndata/vlmc_raw_anndata.h5ad"
}


for sce_name in adata.obs.cell_type.unique():
    sce_obj = adata[adata.obs.cell_type==sce_name]
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])   
    # create_minimal_adata(sce_obj, '/'.join(file_paths[sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[sce_name].split('/')[-1]]))

## **[Gabitto et. al. 2021](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10246227/) (Dorso Lateral Prefrontal Cortex)**

In [5]:
adata = sc.read_h5ad('../data/raw/SEA-AD/DLPFC/RNASeq/SEAAD_DLPFC_RNAseq_final-nuclei.2024-02-13.h5ad')

adata.obs['cell_type'] = adata.obs['Subclass'].copy()
adata.obs['cell_type'] = adata.obs['cell_type'].astype(str)
adata.obs.loc[adata.obs['Class'].str.startswith("Neuronal:"), 'cell_type'] = adata.obs['Class'][adata.obs['Class'].str.startswith("Neuronal:")].map({"Neuronal: Glutamatergic": "Excitatory",
                                                                "Neuronal: GABAergic": "Inhibitory"})

adata.obs.loc[adata.obs.cell_type=='Microglia-PVM', 'cell_type'] = 'Microglia'
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

In [7]:
dat_dir = f'../data/raw/seaad_pfc/anndata/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = None
var_columns_to_convert = None

# Define the SCE objects and their corresponding file paths

file_paths = {
    'Excitatory': "../data/raw/seaad_pfc/anndata/excitatory_raw_anndata.h5ad",
    'Inhibitory': "../data/raw/seaad_pfc/anndata/inhibitory_raw_anndata.h5ad",
    'Astrocyte': "../data/raw/seaad_pfc/anndata/astrocyte_raw_anndata.h5ad",
    'Microglia': "../data/raw/seaad_pfc/anndata/microglia_raw_anndata.h5ad",
    'Oligodendrocyte': "../data/raw/seaad_pfc/anndata/oligodendrocyte_raw_anndata.h5ad",
    'OPC': "../data/raw/seaad_pfc/anndata/opc_raw_anndata.h5ad",
    'Endothelial': "../data/raw/seaad_pfc/anndata/endothelial_raw_anndata.h5ad",
    'VLMC': "../data/raw/seaad_pfc/anndata/vlmc_raw_anndata.h5ad"
}


for sce_name in adata.obs.cell_type.unique():
    sce_obj = adata[adata.obs.cell_type==sce_name]
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])   
    # create_minimal_adata(sce_obj, '/'.join(file_paths[sce_name].split('/')[:-1] + ['minimal/minimal_'+file_paths[sce_name].split('/')[-1]]))