In [None]:
# only for Colab

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
%pip install -q -r requirements.txt

In [None]:
# Run identifiers (edit here when switching to another dataset)
gse_accession = "GSE153807"      # accession string for this dataset
dataset = "thrupp"               # short dataset name; used for sub-folders and file names
project = "Epilepsy_Microglia"   # project name handed to get_paths()

In [None]:
# environment setting
# Standard library (os / random were used by the notebook but never imported)
import os
import random
import warnings
from datetime import date

# Project-local helpers
from env_utils import detect_env, get_paths
from sc_utils import sc_load_fix_h5ad, sc_compile_from_dir, sc_compile_from_tar

env = detect_env()
paths = get_paths(project)

# Numerical / ML Libraries
import numpy as np
import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad
from scar import model, setup_anndata

# Date stamp reused in output file names further down
TODAY = date.today()
print("Today's date: ", TODAY)

# Version & sanity check
print(torch.__version__)
print(scvi.__version__)
print(torch.cuda.is_available())
warnings.simplefilter("ignore")

# Random key: seed every RNG the analysis touches so reruns are reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Trade cuDNN autotuning for deterministic kernels
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
scvi.settings.seed = SEED

Seed set to 42


Today's date:  2025-12-16
2.9.1+cu128
1.3.3
False


In [None]:
# Pathways and setting
from pathlib import Path

# Save figures at 300 dpi without axes frames, into the project's plot folder
sc.set_figure_params(dpi_save=300, frameon=False)
sc.settings.figdir = paths["plots"]

base = paths["base"]

# Per-dataset input / output folders
raw_dir = paths["raw"] / dataset
processed_dir = paths["processed"] / dataset

# Create both folders (and any missing parents); a no-op when they already exist.
# pathlib is used here because `os` is not imported anywhere in the notebook.
for _directory in (raw_dir, processed_dir):
    Path(_directory).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the raw data into a single AnnData object.
# IO_MODE selects which loader helper from sc_utils is used.
IO_MODE = "dir"  # {"single", "dir", "tar"}

if IO_MODE == "single":
    # `raw_path` was referenced here without ever being defined.
    # TODO(review): the file name below is an assumption — point it at the real .h5ad file.
    raw_path = raw_dir / f"{dataset}.h5ad"
    adata = sc_load_fix_h5ad(raw_path)
elif IO_MODE == "dir":
    adata = sc_compile_from_dir(raw_dir, label="sample", merge="first")
elif IO_MODE == "tar":
    adata = sc_compile_from_tar(raw_dir, label="sample", merge="first")
else:
    # Report the offending value so a typo in IO_MODE is easy to spot
    raise ValueError(f"Invalid IO_MODE: {IO_MODE!r}")

In [None]:
# Checkpoint the compiled AnnData under the dataset's processed folder,
# with the dataset name and today's date in the file name.
compiled_h5ad_path = processed_dir / f"{dataset}_{TODAY}.h5ad"
adata.write(compiled_h5ad_path)

In [None]:
# Inspect the gene table, then (optionally) annotate gene identifiers.
# The helper is imported here because this cell runs before the QC cell that
# originally imported it; without the import a fresh "Run All" fails with NameError.
from IPython.display import display
from sc_preprocess import sc_annotate_mygene

# Bare expressions are only rendered when they are the last statement of a cell,
# so show the previews explicitly.
display(adata.var.head())
display(adata.var_names[:10])

# If annotation is needed: replace "__old_id__" with the name of the column
# that holds the original Ensembl IDs.
sc_annotate_mygene(adata, "__old_id__")

In [None]:
# Quality control: compute QC metrics, mask out low-quality cells, drop rare genes
import scanpy as sc
from sc_preprocess import (
    sc_annotate_mygene,
    sc_add_mt_ribo_hb_qc,
    sc_plot_qc_mt_rb_hb,
    sc_build_qc_mask,
    sc_apply_qc_mask,
    sc_plot_qc_distributions,
)

# QC cut-offs handed to the helpers below
# (presumably percentages for the mt / ribo / hb values — confirm in sc_preprocess)
n_mad = 3
mt_threshold, rb_threshold, hb_threshold = 20, 80, 10
genes_threshold = 3  # used as `min_cells` when filtering genes

# Step 1: add the mt / ribo / hb QC annotations and plot them
adata = sc_add_mt_ribo_hb_qc(adata, copy=True)
sc_plot_qc_mt_rb_hb(adata, dataset, TODAY)

# Step 2: build the per-cell mask (n_mad rule plus the three thresholds)
qc_mask = sc_build_qc_mask(adata, n_mad, mt_threshold, rb_threshold, hb_threshold)

# Step 3: visual check of the mask before it is applied
sc_plot_qc_distributions(adata, qc_mask, bins=100)

# Step 4: apply the mask to the cells
adata = sc_apply_qc_mask(adata, qc_mask, copy=True)

# Step 5: keep only genes detected in at least `genes_threshold` cells
sc.pp.filter_genes(adata, min_cells=genes_threshold)


In [None]:
# Train an scVI model on the QC-filtered AnnData and save the trained weights.
from pathlib import Path

# NOTE(review): this requires adata.layers["counts"] plus the obs columns
# "sample_id" and "region". The loading cell passes label="sample" to the
# compile helpers, so confirm that "sample_id" (not "sample") really exists.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="sample_id",
    categorical_covariate_keys=["region"],
)

# NOTE: rebinding `model` shadows the `model` name imported from `scar` in the
# environment cell; re-run that cell if the scar module is needed afterwards.
model = scvi.model.SCVI(adata, n_layers=2, n_latent=20)
model.train()

# The original code joined an undefined OUT_DIR via the never-imported `os`.
# The model is now stored next to the processed data of this dataset.
# TODO(review): change the parent folder if models belong somewhere else.
model_dir = Path(processed_dir) / "model_v3"
model_dir.mkdir(parents=True, exist_ok=True)
model.save(str(model_dir), save_anndata=False, overwrite=True)

This is the end of the notebook