In [1]:
import itertools
import os
import re

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import sklearn.metrics
import sklearn.neural_network
from tqdm import tqdm


# Root data directory: the dataset, metadata and result paths below are built
# by joining file names onto this prefix with os.path.join.
# NOTE(review): the trailing comment looks like the same location spelled as a
# native Windows path — confirm before switching.
prefix = '/mnt/d/PsychAD'  # 'D:/PsychAD'

  machar = _get_machar(dtype)


# Data Preprocessing

In [None]:
# Example dataset
# pd.read_csv('/mnt/c/Users/nck/repos/scGNN/GSE138852/GSE138852_counts.csv.gz', index_col=0, compression='gzip')

In [None]:
# Load data: both .h5ad files are opened with backed='r'
psychad_h5ad = os.path.join(prefix, 'psychAD_snRNAseq_rawCounts.h5ad')
seaad_h5ad = os.path.join(prefix, 'SEAAD_A9_RNAseq_final-nuclei.2024-02-13.h5ad')
adata_psychad = sc.read_h5ad(psychad_h5ad, backed='r')
adata_seaad = sc.read_h5ad(seaad_h5ad, backed='r')  # Could be loaded more efficiently using h5py

# Rebuild SEA-AD from X, obs and var only, which drops the unnecessary fields:
#   adata_seaad.layers['UMIs']
#   adata_seaad.obsp['connectivities']
#   adata_seaad.obsp['distances']
adata_seaad = ad.AnnData(adata_seaad.X, obs=adata_seaad.obs, var=adata_seaad.var)

# Remove HBCC from training: True for every PsychAD cell whose Source is not HBCC.
# The mask is kept separate instead of subsetting adata_psychad because indexing
# a backed view with a mask just creates the whole matrix.
mask_psychad = ~(adata_psychad.obs['Source'] == 'HBCC')


In [None]:
# Build one pseudobulk profile (sum of raw counts over cells) per
# (individual, subclass) stratum for each dataset, write one gzip CSV per
# dataset (genes x strata), then build the gene-wise intersection of the two.

# Stratification params
stratifying_cols_psychad = ['Individual_ID', 'subclass']  # Implied stratification by source and class
stratifying_cols_seaad = ['Donor ID', 'Subclass']

# Neat params
adata_names = ['PsychAD', 'SEA-AD']
adatas = [adata_psychad, adata_seaad]
stratifying_cols_list = [stratifying_cols_psychad, stratifying_cols_seaad]
masks = [mask_psychad, None]  # Optional per-dataset cell filter (None keeps every cell)


def _pseudobulk_path(dataset_name):
    """Return the pseudobulk CSV path for `dataset_name` under `prefix`."""
    return os.path.join(prefix, f'pseudobulk_{dataset_name}.csv.gz')


# Aggregating with scanpy was tried first but is not implemented:
# adata.obs['strat'] = adata.obs.apply(lambda r: ' - '.join([r[col] for col in stratifying_cols]), axis=1)
# adata_pseudobulk = sc.get.aggregate(adata, by='strat', func='sum', axis='obs')

# Manually perform stratification
for adata_name, adata, stratifying_cols, mask_adata in zip(adata_names, adatas, stratifying_cols_list, masks):
    unique_strat_vals = [np.unique(adata.obs[col]) for col in stratifying_cols]
    total_num = int(np.prod([len(usv) for usv in unique_strat_vals]))

    # Loop-invariant pieces, hoisted out of the per-stratum loop
    obs_cols = [adata.obs[col] for col in stratifying_cols]
    if mask_adata is None:
        base_mask = np.ones(adata.X.shape[0], dtype=bool)
    else:
        base_mask = np.asarray(mask_adata, dtype=bool)

    pseudo_data = []
    strat_names = []
    for stratification in tqdm(itertools.product(*unique_strat_vals), desc=adata_name, total=total_num):
        # Cells that pass the dataset filter and match every stratifying value
        mask = base_mask.copy()
        for obs_col, strat in zip(obs_cols, stratification):
            mask &= (obs_col == strat).to_numpy()

        # Continue if no samples found
        if not mask.any():
            continue

        # Add name
        strat_names.append('_'.join(map(str, stratification)))

        # Aggregate samples, sum for pseudobulk. Summing a scipy sparse matrix
        # over axis 0 yields a (1, n_genes) np.matrix; flattening to 1-D keeps
        # the stacked result 2-D (strata x genes) for the DataFrame below.
        raw = adata.X[np.flatnonzero(mask)]
        pseudo_data.append(np.asarray(raw.sum(axis=0)).ravel())

    if not pseudo_data:
        raise ValueError(f'No cells matched any stratification for {adata_name}')

    # Format data
    pseudo_data = np.stack(pseudo_data, axis=0)

    # Save to file, named after the dataset and located under the shared prefix
    df = pd.DataFrame(data=pseudo_data.T, index=adata.var_names.to_numpy(), columns=strat_names)
    df.to_csv(_pseudobulk_path(adata_name), compression='gzip')

# Read files; the first column holds the gene names and must become the index
# so that the merge below joins on genes rather than on row position.
pseudo_psychad = pd.read_csv(_pseudobulk_path('PsychAD'), index_col=0, compression='gzip')
pseudo_seaad = pd.read_csv(_pseudobulk_path('SEA-AD'), index_col=0, compression='gzip')

# Construct and save common (genes present in both datasets)
pseudo_common = pd.merge(pseudo_psychad, pseudo_seaad, left_index=True, right_index=True, how='inner')
pseudo_common.to_csv(os.path.join(prefix, 'pseudobulk_common.csv.gz'), compression='gzip')

In [None]:
# Save ct labels for scGNNv2: for each pseudobulk file, write a CSV that maps
# every sample column to a harmonised cell-type label.

# Column names look like '<donor id>_<cell type>'; group 1 is the donor id and
# group 2 the cell type. Raw string so that '\d' is not treated as a (invalid)
# string escape sequence.
# NOTE(review): the '.' in the 'H..' alternative is unescaped and matches any
# character — confirm whether a literal dot was intended.
sample_name_re = re.compile(r'^(AMPAD_(?:HBCC|MSSM)_\d+M?\d+|H\d+.\d+.\d+|R\d+)_(.+)$')

# Translate cell types onto a shared vocabulary; names absent from this table
# (including the commented-out ones) are kept unchanged.
ct_translation = {
    'Astrocyte': 'Astro',
    # 'Chandelier': '',
    'Endothelial': 'Endo',
    'L2/3 IT': 'EN_L2_3_IT',
    # 'L4 IT': '',
    'L5 ET': 'EN_L5_ET',
    # 'L5 IT': '',
    'L5/6 NP': 'EN_L5_6_NP',
    'L6 CT': 'EN_L6_CT',
    'L6 IT': 'EN_L6_IT',  # Unclear
    'L6 IT Car3': 'EN_L6_IT',  # Unclear
    'EN_L6_IT_1': 'EN_L6_IT',  # PAD
    'EN_L6_IT_2': 'EN_L6_IT',  # PAD
    'L6b': 'EN_L6B',
    # 'Lamp5': '',
    'Lamp5 Lhx6': 'IN_LAMP5_LHX6',
    'Microglia-PVM': 'Micro',  # Unclear, PVM
    'PVM': 'Micro',  # PAD
    # 'OPC': 'OPC',
    'Oligodendrocyte': 'Oligo',
    # 'Pax6': '',
    'Pvalb': 'IN_PVALB',
    # 'Sncg': '',
    'Sst': 'IN_SST',
    'Sst Chodl': 'IN_SST',
    # 'VLMC': 'VLMC',
    'Vip': 'IN_VIP'}


def _cell_type_of(sample_name):
    """Return the cell-type part of a '<donor id>_<cell type>' sample name.

    Raises ValueError when the name does not follow the expected pattern.
    """
    match = sample_name_re.search(sample_name)
    if match is None:
        raise ValueError(f'Unrecognised sample name: {sample_name!r}')
    return match.group(2)


csv_suffix = '.csv.gz'
fnames = ['pseudobulk_PsychAD.csv.gz', 'pseudobulk_SEA-AD.csv.gz', 'pseudobulk_common.csv.gz']
for fname in fnames:
    # Only the header row is needed: after transposing, samples are the index
    df = pd.read_csv(os.path.join(prefix, fname), nrows=0, index_col=0).T
    # Get cell types
    df['Cell Type'] = [_cell_type_of(idx) for idx in df.index]
    # Translate cell types
    df['Cell Type'] = df['Cell Type'].map(lambda ct: ct_translation.get(ct, ct))
    # Save cell types as '<stem>_ct.csv' next to the input file
    df.to_csv(os.path.join(prefix, fname[:-len(csv_suffix)] + '_ct.csv'))

In [None]:
# split_idx = np.argwhere([psd.shape[1] == 34890 for psd in pseudo_data]).max()
# # PsychAD
# psd = np.concatenate(pseudo_data[:split_idx+1], axis=0)
# df = pd.DataFrame(data=psd.T, index=adata_psychad.var_names.to_numpy(), columns=strat_names[:split_idx+1])
# df.to_csv(f'D:/PsychAD/pseudobulk_PsychAD.csv.gz', compression='gzip')
# psd = np.concatenate(pseudo_data[split_idx+1:], axis=0)
# df = pd.DataFrame(data=psd.T, index=adata_seaad.var_names.to_numpy(), columns=strat_names[split_idx+1:])
# df.to_csv(f'D:/PsychAD/pseudobulk_SEA-AD.csv.gz', compression='gzip')

# Results

In [2]:
# Build `meta`: one row per donor, indexed by donor id, with exactly two
# columns — 'Brain Bank' and 'AD' — for PsychAD and SEA-AD combined.

# Load PsychAD Meta
meta_PsychAD = (
    pd.read_csv(os.path.join(prefix, 'clinical_metadata.csv'))
    .set_index('SubID_export_synapse')[['Brain_bank', 'AD']]
    .rename(columns={'Brain_bank': 'Brain Bank'})
    .rename_axis(None)
)
# Donors with a missing AD value are treated as non-AD
meta_PsychAD['AD'] = meta_PsychAD['AD'].fillna(0)

# Load SEA-AD meta
# pd.set_option('display.max_columns', None)
meta_SEAAD = (
    pd.read_csv(os.path.join(prefix, 'SEAAD_A9_RNAseq_final-nuclei_metadata.2024-02-13.csv'))
    .set_index('Donor ID')
    .rename_axis(None)
)
# Keep the first row per donor; .copy() so the column assignments below act on
# an independent frame rather than on a filtered view of the original.
meta_SEAAD = meta_SEAAD.loc[~meta_SEAAD.index.duplicated(keep='first')].copy()
# If reference is AD, then the only discrepancy is intermediate +8 samples
# np.unique(meta_SEAAD.groupby('Donor ID').first()[['Overall AD neuropathological Change']], return_counts=True)
# Everything except 'Not AD' counts as AD. Stored as 0.0/1.0 rather than bool
# so the concatenated 'AD' column does not become a mixed object column.
# NOTE(review): assumes the PsychAD 'AD' column is numeric 0/1 — confirm.
meta_SEAAD['AD'] = meta_SEAAD['Overall AD neuropathological Change'].ne('Not AD').astype(float)
meta_SEAAD['Brain Bank'] = 'SEAAD'
meta_SEAAD = meta_SEAAD[['Brain Bank', 'AD']]

# Concatenate
meta = pd.concat([meta_PsychAD, meta_SEAAD])
del meta_PsychAD, meta_SEAAD

In [None]:
# Fit an MLP that predicts AD status from per-donor mean embeddings and report
# AUC / balanced accuracy on the training donors, held-out MSSM donors and the
# external (RUSH + SEAAD) donors.

# Sample names look like '<donor id>_<cell type>'; group 1 is the donor id.
# Raw string so that '\d' is not treated as a (invalid) string escape sequence.
donor_name_re = re.compile(r'^(AMPAD_(?:HBCC|MSSM)_\d+M?\d+|H\d+.\d+.\d+|R\d+)_(.+)$')


def _donor_id_of(sample_name):
    """Return the donor-id part of a '<donor id>_<cell type>' sample name.

    Raises ValueError when the name does not follow the expected pattern.
    """
    match = donor_name_re.search(sample_name)
    if match is None:
        raise ValueError(f'Unrecognised sample name: {sample_name!r}')
    return match.group(1)


# Load embeddings
result_name = ['scGNNv1/PsychAD_All', 'scGNNv1/PsychAD', 'scGNNv1/PsychAD_SEAAD'][2]
embeddings_fname = os.path.join(prefix, result_name, 'outputdir', 'PsychAD_embedding.csv')
embeddings_by_sample = pd.read_csv(embeddings_fname, index_col=0)
# Take mean embedding for each individual
embeddings_by_sample['id'] = [_donor_id_of(idx) for idx in embeddings_by_sample.index]
embeddings_by_donor = embeddings_by_sample.groupby('id').mean()
embeddings, embedding_ids = embeddings_by_donor.to_numpy(), embeddings_by_donor.index.to_numpy()

# Filter and sort meta so its rows line up with the embedding rows
meta_filter = meta.loc[embedding_ids]

# Get training IDs: the 'sample' values are rewritten into the
# 'AMPAD_MSSM_<10-digit zero-padded number>' form used by the metadata index
mssm_holdout = pd.read_csv(os.path.join(prefix, 'node_imp_score.csv'), index_col=0)
training_sample_ids = mssm_holdout['sample'].unique()
training_sample_ids = np.array([f'AMPAD_MSSM_{int(sid[1:]):010}' for sid in training_sample_ids])
mask_train = meta_filter.index.isin(training_sample_ids)

# Define data
X = embeddings
# One numeric dtype for the labels, so 0/1 and False/True entries are treated
# as the same two classes by the classifier and the metrics
y = meta_filter['AD'].astype(float)
# Logical AND (not arithmetic '*') to combine the boolean masks
mask_test_MSSM = (meta_filter['Brain Bank'] == 'MSSM') & ~mask_train
mask_test_EXT = meta_filter['Brain Bank'].isin(['RUSH', 'SEAAD']) & ~mask_train
X_train, y_train = X[mask_train], y[mask_train]
X_test_MSSM, y_test_MSSM = X[mask_test_MSSM.to_numpy()], y[mask_test_MSSM]
X_test_EXT, y_test_EXT = X[mask_test_EXT.to_numpy()], y[mask_test_EXT]

# Fit MLP
mlp = sklearn.neural_network.MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

# Get accuracies
iterable = [
    ('Training', X_train, y_train),
    ('MSSM Heldout', X_test_MSSM, y_test_MSSM),
    ('RUSH + SEAAD', X_test_EXT, y_test_EXT),
]
for split_name, X_test, y_test in iterable:
    # Predict MLP: probability of the positive class (label 1), thresholded at 0.5
    y_prob = mlp.predict_proba(X_test)[:, mlp.classes_ == 1.].flatten()
    y_pred = (y_prob >= .5) * 1.

    # Get AUC and BACC
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_prob)
    auc = sklearn.metrics.auc(fpr, tpr)
    bacc = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    # CLI
    print(f'{split_name}: AUC ({auc:.3f}), BACC ({bacc:.3f})')

Training: AUC (0.544), BACC (0.508)
MSSM Heldout: AUC (0.632), BACC (0.573)
RUSH + SEAAD: AUC (0.312), BACC (0.494)
