In [1]:
import os

import anndata as ad
import pandas as pd
# import scanpy as sc
import simba as si

# Root directory that holds the PsychAD inputs and receives the outputs.
# Defaults to the original location; set the PSYCHAD_PREFIX environment
# variable to point the notebook at the data on another machine.
prefix = os.environ.get('PSYCHAD_PREFIX', '/mnt/d/PsychAD')

# Get Data

In [2]:
# Load pseudocells
# The gzip-compressed CSV is read with its first column as the index and then
# transposed before being wrapped in an AnnData object.
# NOTE(review): presumably the file is stored genes x pseudocells, hence the
# transpose -- confirm against the script that wrote it.
dataset_name = 'PsychAD'
pseudobulk_path = os.path.join(prefix, f'pseudobulk_{dataset_name}.csv.gz')
data = pd.read_csv(
    pseudobulk_path,
    index_col=0,
    compression='gzip',
).transpose()
adata = ad.AnnData(data)

# Alternative (disabled): load the raw single-nucleus counts from source.
# # Load from source
# import scanpy as sc
# adata_psychad = sc.read_h5ad(os.path.join(prefix, 'psychAD_snRNAseq_rawCounts.h5ad'), backed='r')
# # adata_seaad = sc.read_h5ad(os.path.join(prefix, 'SEAAD_A9_RNAseq_final-nuclei.2024-02-13.h5ad'), backed='r')
# # adata_seaad = ad.AnnData(adata_seaad.X, obs=adata_seaad.obs, var=adata_seaad.var)
# # Remove HBCC from training
# mask_psychad = adata_psychad.obs['Source'] == 'HBCC'
# mask_psychad = ~mask_psychad

# Run SIMBA

In [3]:
# Filter
# Thresholds passed to SIMBA's gene-level and cell-level filters.
min_cells_per_gene = 3
min_genes_per_cell = 100
si.pp.filter_genes(adata, min_n_cells=min_cells_per_gene)
# The QC call sits between the two filters; keep this order.
si.pp.cal_qc_rna(adata)
si.pp.filter_cells_rna(adata, min_n_genes=min_genes_per_cell)

In [4]:
# Normalize
# Normalization with the 'lib_size' method, followed by a log transform.
normalization_method = 'lib_size'
si.pp.normalize(adata, method=normalization_method)
si.pp.log_transform(adata)

In [5]:
# Variable gene selection
# Number of top genes requested from the selection step.
n_top_genes = 2000
si.pp.select_variable_genes(adata, n_top_genes=n_top_genes)
# Diagnostic plot of the selection, with text labels enabled.
si.pl.variable_genes(adata, show_texts=True)

In [6]:
# Discretize
# Number of bins used by the discretization step.
n_bins = 5
si.tl.discretize(adata, n_bins=n_bins)

In [None]:
# Generate training graph
# Options are collected first so they are easy to inspect and tweak.
# Note that use_highly_variable is False, so the graph is not restricted to
# the variable genes selected earlier in the notebook.
graph_options = dict(
    list_CG=[adata],
    layer='simba',
    use_highly_variable=False,
    dirname='graph0',
)
si.tl.gen_graph(**graph_options)

In [8]:
# Train model
# Training options, gathered in one place and unpacked into the call.
train_options = {
    'auto_wd': True,
    'save_wd': True,
    'output': 'model',
}
si.tl.pbg_train(**train_options)

In [None]:
# Load graph and model
# Both loaders are called without arguments, so they rely on SIMBA's own
# defaults for where to look.
# NOTE(review): presumably these pick up the outputs written by the graph
# generation and training cells (dirname='graph0', output='model') -- confirm
# against the SIMBA documentation before running in a fresh kernel.
si.load_graph_stats()
si.load_pbg_config()

In [None]:
# Load embeddings
# read_embedding() returns a mapping keyed by entity type; the 'C' entry is
# taken as the cell embeddings and the 'G' entry as the gene embeddings.
dict_adata = si.read_embedding()
adata_C, adata_G = dict_adata['C'], dict_adata['G']

In [None]:
# Dataset label used to build the output file name in the save cell.
# This repeats the assignment made in the data-loading cell.
# NOTE(review): presumably re-declared so the embedding cells can be run
# without re-executing the loading cell -- keep the two values in sync.
dataset_name = 'PsychAD'

In [None]:
# Save embeddings
# AnnData stores X as an array, which has no .to_csv(); convert to a
# DataFrame first. to_df() labels rows with obs_names and columns with
# var_names, so the CSV keeps the cell identifiers.
embedding_path = os.path.join(prefix, f'SIMBA/{dataset_name}.csv')
# Create the SIMBA output directory if it is missing, so the write cannot fail on it.
os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
adata_C.to_df().to_csv(embedding_path)