In [1]:
import os
import rpy2
import logging
import warnings
import anndata2ri
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from matplotlib.pyplot import rcParams

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
anndata2ri                  1.1
appnope                     0.1.3
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
comm                        0.1.3
cpuinfo                     NA
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
executing                   1.2.0
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
louvain                     0.8.0
markupsafe                  2.1.3
matplotlib 

# **Load input data in `.h5ad` format and map to cell class annotation provided in [MapMyCells](https://portal.brain-map.org/atlases-and-data/bkp/mapmycells)**

## **[Gazestani et. al. 2019](https://doi.org/10.1016/j.cell.2023.08.005) (Prefrontal Cortex)**

In [None]:
def convert_columns_to_string(sce, obs_cols=None, var_cols=None):
    """
    Convert specified columns in the .obs and .var DataFrames of an anndata object to string type.

    Parameters:
    sce (anndata.AnnData): The single-cell AnnData object to modify.
    obs_cols (list of str): Columns in sce.obs to convert to strings.
    var_cols (list of str): Columns in sce.var to convert to strings.
    """
    if obs_cols is not None:
        for col in obs_cols:
            sce.obs[col] = sce.obs[col].astype(str)
    
    if var_cols is not None:
        for col in var_cols:
            sce.var[col] = sce.var[col].astype(str)

def save_anndata(sce, file_path):
    """
    Save an AnnData object to a file.

    Parameters:
    sce (anndata.AnnData): The single-cell AnnData object to save.
    file_path (str): The path to save the file to.
    """
    sce.write_h5ad(file_path)


# Function to filter and update AnnData object based on cell barcode annotations
def filter_and_update_anndata(ad, annot):

    # Filter valid cell barcodes
    #valid_barcodes = ad.obs_names.intersection()
    ad_filtered = ad[annot['cell_barcode'].to_list()].copy()
    
    # Merge annotations
    ad_filtered.obs = ad_filtered.obs.merge(annot, left_on=ad_filtered.obs_names, right_on='cell_barcode', how='right')
    
    # Update obs_names with cell barcodes
    ad_filtered.obs_names = ad_filtered.obs['cell_barcode']
    
    return ad_filtered

In [12]:
celltypes = ['excitatory', 'inhibitory', 'microglia', 'astrocyte', 'oligodendrocyte', 'endothelial', 'opc']

In [13]:
dat_dir = f'../data/raw/gazestani_pfc/anndata/'
files = os.listdir(dat_dir+'cell_type_mapping/')

mapping = {}
for cell_type in celltypes:
    file = next((item for item in files if cell_type in item), None)
    if file is not None:
     mapping[cell_type] = pd.read_csv(dat_dir+'cell_type_mapping/'+file, skiprows=3)
     mapping[cell_type]





In [52]:
mapping['astrocyte']

Unnamed: 0,cell_id,class_label,class_name,class_softmax_probability,subclass_label,subclass_name,subclass_softmax_probability,supertype_label,supertype_name,supertype_softmax_probability
0,1020Y1_TTAATCCCAGCTGTGC-1,CS20230505_CLAS_0003,Non-neuronal and Non-neural,0.9582,CS20230505_SUBC_0021,Oligodendrocyte,0.9582,CS20230505_SUPT_0120,Oligo_1,0.9582
1,1020Y1_AATTTCCTCAAACGAA-1,CS20230505_CLAS_0002,Neuronal: Glutamatergic,0.9594,CS20230505_SUBC_0015,L5 ET,0.5461,CS20230505_SUPT_0091,L5 ET_2,0.5461
2,1020Y1_GTGTCCTTCTCACTCG-1,CS20230505_CLAS_0002,Neuronal: Glutamatergic,0.9954,CS20230505_SUBC_0015,L5 ET,0.9950,CS20230505_SUPT_0091,L5 ET_2,0.9950
3,1020Y1_ACTTTCATCGTGGCTG-1,CS20230505_CLAS_0002,Neuronal: Glutamatergic,0.9837,CS20230505_SUBC_0010,L2/3 IT,0.9830,CS20230505_SUPT_0084,L2/3 IT_3,0.9830
4,1020Y1_TCCGGGATCCAAGCTA-1,CS20230505_CLAS_0002,Neuronal: Glutamatergic,0.9996,CS20230505_SUBC_0010,L2/3 IT,0.9996,CS20230505_SUPT_0073,L2/3 IT_13,0.9996
...,...,...,...,...,...,...,...,...,...,...
73482,1074D_TTCCTCTCACTTCCTG-1,CS20230505_CLAS_0003,Non-neuronal and Non-neural,0.9460,CS20230505_SUBC_0024,Microglia-PVM,0.7433,CS20230505_SUPT_0138,Micro-PVM_4-SEAAD,0.7433
73483,1074D_ATCCGTCTCTAGGAAA-1,CS20230505_CLAS_0001,Neuronal: GABAergic,0.5661,CS20230505_SUBC_0008,Pvalb,0.5005,CS20230505_SUPT_0054,Pvalb_5,0.5005
73484,1074D_CGTGTCTAGCGTGTCC-1,CS20230505_CLAS_0003,Non-neuronal and Non-neural,0.5853,CS20230505_SUBC_0021,Oligodendrocyte,0.5706,CS20230505_SUPT_0124,Oligo_3,0.5706
73485,1074D_TTCAATCGTACTAACC-1,CS20230505_CLAS_0002,Neuronal: Glutamatergic,0.5946,CS20230505_SUBC_0016,L6 CT,0.4754,CS20230505_SUPT_0093,L6 CT_2,0.4754


In [45]:
subclass_map = {key:'OPC_'+str(ind) for ind, key in enumerate(mapping[cell_type].subclass_name.unique())}

In [48]:
map_list = mapping[cell_type].subclass_name.unique()

array(['L2/3 IT', 'Oligodendrocyte', 'L4 IT', 'Microglia-PVM', 'Sncg',
       'Vip', 'L6 CT', 'OPC', 'L5 IT', 'VLMC', 'Lamp5', 'Sst', 'L5/6 NP',
       'Endothelial', 'L5 ET', 'Pax6', 'L6 IT Car3', 'Chandelier',
       'Astrocyte', 'L6 IT', 'Lamp5 Lhx6', 'L6b'], dtype=object)

In [None]:
'minimal/cell_type_mapping/'


metadata = pd.read_excel('../data/raw/gazestani_pfc/full_meta.xlsx')
metadata['pathology_group'] = metadata.Status.map({'Abeta': "early", 'AbetaTau': 'late', 'Ctrl': 'no'})
metadata['individualID'] = metadata['Sbj code'].astype(str)

metadata.to_csv('../data/raw/gazestani_pfc/gazestani_pfc_metadata.csv')

## **[Gerrits et. al. 2021](https://link.springer.com/article/10.1007/s00401-021-02263-w) (Occipital Cortex)**

In [None]:
%%R -o sce_list -o annot_list

# Function to transform sample column
transform_sample <- function(sample_column, start_ind=3) {
  sapply(sample_column, function(x) {
    paste(unlist(strsplit(x, "_"))[start_ind:length(unlist(strsplit(x, "_")))], collapse = "_")
  })
}

# Paths
base_path <- "../data/raw/gazestani_pfc/organized_data/Human/brain/snRNA/Gerrits_33609158"
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("MG")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
sce_list <- lapply(paste0(base_path, "/", cell_types, "_data_arranged_updatedId_final_batches.qs"), read_data)
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Gerrits_33609158'))


# Assign names to lists
names(sce_list) <- cell_types
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
  sce_list[[cell_type]] <- subset_sce(sce_list[[cell_type]], annot_list[[cell_type]])
  sce_list[[cell_type]] <- merge_annotations(sce_list[[cell_type]], annot_list[[cell_type]])
}



In [None]:
dat_dir = f'../data/raw/gerrits_otc/anndata/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = ['anno_braak_score', 'anno_orig_cellState']
var_columns_to_convert = ['entrezid']

# Define the SCE objects and their corresponding file paths
sce_objects = sce_list

file_paths = {
    'MG': "../data/raw/gerrits_otc/anndata/microglia_raw_anndata.h5ad",
}

# Loop through SCE objects to convert column types and save them
for sce_name, sce_obj in sce_objects.items():
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])


In [None]:
metadata = sce_list['MG'].obs.drop_duplicates(subset='title', keep='first')
metadata['pathology_group'] = metadata['sample.group.ch1'].map({'CTR+': "early", 'AD': 'late', 'CTR': 'no'})
metadata['individualID'] = metadata.title.astype(str)

metadata.to_csv('../data/raw/gerrits_otc/gerrits_otc_metadata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['pathology_group'] = metadata['sample.group.ch1'].map({'CTR+': "early", 'AD': 'late', 'CTR': 'no'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['individualID'] = metadata.title.astype(str)


## **[Mathys et. al. 2019](https://www.nature.com/articles/s41586-019-1195-2) (Prefrontal Cortex)**

In [None]:
%%R -o annot_list

# Paths
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("Astro", "Endo", "ExN", "InN", "MG", "Oligo", "OPC")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Mathys_31042697'))

# Assign names to lists
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
}

In [None]:
adata = sc.read_mtx('../data/raw/mathys_pfc/notfiltered_count_matrix.mtx').T

obs_names = pd.read_csv('../data/raw/mathys_pfc/notfiltered_column_metadata.txt', sep='\t')
obs_names.TAG = obs_names.TAG.str.replace('.', '-')
obs_names.set_index('TAG', inplace=True)
obs_names.index.rename('index', inplace=True)

gene_names = pd.read_csv('../data/raw/mathys_pfc/notfiltered_gene_row_names.txt', sep='\t', header=None)

adata.obs = obs_names
adata.var_names = gene_names[1]


sce_list = {}
for cell_type, annot in annot_list.items():
    if len(annot_list[cell_type])>0:
        sce_list[cell_type] = filter_and_update_anndata(adata, annot_list[cell_type])


In [None]:
dat_dir = f'../data/raw/mathys_pfc/anndata/'

if not os.path.exists(dat_dir):
    os.makedirs(dat_dir)

# Define the columns to convert for obs and var
obs_columns_to_convert = ['anno_braak_score']
var_columns_to_convert = None

# Define the SCE objects and their corresponding file paths
sce_objects = sce_list

file_paths = {
    'ExN': "../data/raw/mathys_pfc/anndata/excitatory_raw_anndata.h5ad",
    'InN': "../data/raw/mathys_pfc/anndata/inhibitory_raw_anndata.h5ad",
    'Astro': "../data/raw/mathys_pfc/anndata/astrocyte_raw_anndata.h5ad",
    'MG': "../data/raw/mathys_pfc/anndata/microglia_raw_anndata.h5ad",
    'Oligo': "../data/raw/mathys_pfc/anndata/oligodendrocyte_raw_anndata.h5ad",
    'OPC': "../data/raw/mathys_pfc/anndata/opc_raw_anndata.h5ad",
    'Endo': "../data/raw/mathys_pfc/anndata/endothelial_raw_anndata.h5ad"
}

# Loop through SCE objects to convert column types and save them
for sce_name, sce_obj in sce_objects.items():
    convert_columns_to_string(sce_obj, obs_cols=obs_columns_to_convert, var_cols=var_columns_to_convert)
    save_anndata(sce_obj, file_paths[sce_name])       


In [None]:
metadata = sce_list['Astro'].obs.drop_duplicates(subset='projid', keep='first')
metadata['pathology_group'] = metadata['anno_braak_score'].map({'3': "early", '4': 'early', '5': 'late', '6': 'late', '0': 'no', '1': 'no', '2': 'no'})
metadata['individualID'] = metadata.projid.astype(str)

metadata.to_csv('../data/raw/mathys_pfc/mathys_pfc_metadata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['pathology_group'] = metadata['anno_braak_score'].map({'3': "early", '4': 'early', '5': 'late', '6': 'late', '0': 'no', '1': 'no', '2': 'no'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['individualID'] = metadata.projid.astype(str)


## **[Leng et. al. 2021](https://www.nature.com/articles/s41593-020-00764-7) (Superior Frontal Gyrus & Entorhinal Cortex)**

In [None]:
%%R -o annot_list

# Paths
annotation_path <- "../data/raw/gazestani_pfc/annotations"

# Cell types and annotations
cell_types <- c("Astro", "Endo", "ExN", "InN", "MG", "Oligo", "OPC")
annotations <- paste0(cell_types, "_Final_anno.qs")

# Read and filter data
annot_list <- lapply(paste0(annotation_path, "/", annotations), read_data)
annot_list <- lapply(annot_list, function(x) filter_annotations(x, batch = 'human_Leng_33432193'))

# Assign names to lists
names(annot_list) <- cell_types

# Transform sample column, subset SCE objects, and merge annotations
for (cell_type in cell_types) {
  annot_list[[cell_type]]$cell_barcode <- transform_sample(annot_list[[cell_type]]$sample, 4)
}

In [None]:
# processed data obtained 
readRDS = robjects.r['readRDS']
df_etc = readRDS('../data/raw/leng_etc/sce.EC.scAlign.assigned.rds')
adata_leng_etc = df_etc

# processed data obtained 
readRDS = robjects.r['readRDS']
df_sfg = readRDS('../data/raw/leng_sfg/sce.SFG.scAlign.assigned.rds')
adata_leng_sfg = df_sfg

ctypes = ['Exc', 'Inh', 'Astro', 'Endo', 'Micro', 'OPC', 'Oligo']

adata_leng_etc.write_h5ad('../data/raw/leng_etc/leng_etc_raw_anndata.h5ad')
adata_leng_sfg.write_h5ad('../data/raw/leng_sfg/leng_sfg_raw_anndata.h5ad')