In [None]:
# Mount Google Drive at /content/drive so the paths under MyDrive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the repository folder on Drive, then install the
# packages listed in its requirements.txt into the kernel's environment (-q: quiet output).
%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
%pip install -q -r requirements.txt

**2. Preprocess**

1. MyGene annotation
(Optional: 1-1. Annotation sanity check)
2. QC (Basic mito, pct)
3. Set up scVI model

-----------------------------------------
*Logs*

20251020 Create RERUN_PREPROCESS : Unpack .tar.gz to read and concatenate.

20251020 Revise ANNOTATE_MYGENE : Fix logic flow and memory efficiency. Drop the sanity check : ~30% dropout is allowed

20251020 Revise SCVI_SETUP : Fix learning parameters. Reduce keys to prevent overfitting

In [None]:
import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import igraph
import leidenalg
import random

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip

# Report the torch / scvi versions and whether CUDA is visible to torch
for runtime_info in (torch.__version__, scvi.__version__, torch.cuda.is_available()):
    print(runtime_info)

# Random key: a single seed shared by every random number source used in this notebook
SEED = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# scvi-tools keeps its own seed setting
scvi.settings.seed = SEED

In [None]:
# Directory layout. TEMP_DIR and OUT_DIR are created below if they do not exist;
# RAW_DIR is not created here.
#
# RAW_DIR  : where raw count matrices are stored in .tar.gz
# TEMP_DIR : where unpacked raw count matrices are stored
# OUT_DIR  : where h5ad files and models will be saved

RAW_DIR = "/content/drive/MyDrive/datas/epilepsy_microglia/raw/"
TEMP_DIR = "/content/data/"
OUT_DIR = "/content/drive/MyDrive/datas/epilepsy_microglia/processed/GSE201048"

for directory in (TEMP_DIR, OUT_DIR):
    os.makedirs(directory, exist_ok=True)

# Base file name (without extension) used when reading .h5ad files from OUT_DIR
FNAME = "kumar_v2_20251020"

In [None]:
# @title
# not for github

## 0. RERUN PREPROCESS
# C : 20251020
#
# Unpacks the raw archive from RAW_DIR into TEMP_DIR, reads every .h5ad file found
# there, and concatenates the per-sample objects into a single `adata`.

# NOTE(review): this overwrites the FNAME set in the config cell; the "Download" cell
# further down reads f"{FNAME}.h5ad", so which file it loads depends on whether this
# cell has been run. Confirm that this is intended.
FNAME = "GSE201048_raw"

# 1. Unpack tar from RAW_DIR
import tarfile
src = os.path.join(RAW_DIR, f"{FNAME}.tar.gz")
with tarfile.open(src, "r:gz") as tar:
  for member in tar.getmembers():
    # Only regular files are needed; skip directories, links and other special members
    if not member.isfile():
      continue
    # Drop any directory components so every file lands directly in TEMP_DIR
    member.name = os.path.basename(member.name)
    tar.extract(member, TEMP_DIR)

# 2. Read from TEMP_DIR
adatas = {}

# Sorted so the concatenation order (and therefore the cell order in `adata`)
# does not depend on the order the filesystem happens to list the files in
for file in sorted(glob.glob(os.path.join(TEMP_DIR, "*.h5ad"))):
  sample_adata = sc.read_h5ad(file)
  if sample_adata.n_obs == 0:
    # No cells means there is no sample_id to read from obs
    print(f"Skipping {file}: it contains no cells.")
    continue
  sample_adata.var_names_make_unique()
  # The sample label is taken from the first cell of the file
  sample_id = sample_adata.obs["sample_id"].iloc[0]
  if sample_id in adatas:
    # Same outcome as before (the later file wins), but no longer silent
    print(f"Warning: duplicate sample_id {sample_id!r} in {file}; replacing the sample loaded earlier.")
  adatas[sample_id] = sample_adata

if adatas:
  # Explicit lists keep the objects and their keys aligned
  adata = sc.concat(
      list(adatas.values()),
      label="sample_id",
      keys=list(adatas.keys()),
      merge="same"
  )
  adata.obs_names_make_unique()
  print(adata.obs["sample_id"].value_counts())
else:
  # `adata` is not assigned in this case
  print("No .h5ad files found in the directory.")

In [None]:
# not for github
# Download
#
# Load the AnnData object stored as <FNAME>.h5ad in OUT_DIR and display its summary.

h5ad_path = os.path.join(OUT_DIR, f"{FNAME}.h5ad")
adata = sc.read_h5ad(h5ad_path)
adata

In [None]:
# not for github
# Upload
#
# Write the current AnnData object to OUT_DIR under a fixed file name.

upload_path = os.path.join(OUT_DIR, "kumar_v2_20251020.h5ad")
adata.write(upload_path)

In [None]:
## 1. ANNOTATE_MYGENE
# R : 20251020

# Annotate ENSG with HGNC symbols

!pip install mygene
import mygene

mg = mygene.MyGeneInfo()

# 현재 gene_ids는 ensg .. .1, .2 이런 데이터로 되어 있음
# 해당 gene_ids에서 소수점을 제거하고 ensembl_id라는 새로운 column으로 저장하기

adata.var['ensembl_id'] = adata.var['gene_ids'].astype(str).str.split('.').str[0]
genes = adata.var['ensembl_id'].dropna().unique().tolist()

res_list = []
for chunk in [genes[i:i+1000] for i in range(0, len(genes), 1000)]:
    res_part = mg.querymany(chunk, scopes="ensembl.gene", fields="symbol", species="human")
    res_list.extend(res_part)
df = pd.DataFrame(res_list)
df = df[~df['symbol'].isna()].drop_duplicates(subset='query').rename(columns={'query':'ensembl_id'})

# Merge
adata.var['gene_symbol'] = adata.var['ensembl_id'].map(df.set_index('ensembl_id')['symbol'])

# Drop NaNs
adata = adata[:, ~adata.var['gene_symbol'].isna()].copy()
adata.var_names_make_unique()
adata.var

In [None]:
# 1-1 (Optional) : Sanity check of mapped subset
#
# Compares highly variable genes (HVGs) selected within the 'mapped' genes against
# the HVG flags already present for all genes.
# Needs adata.var['status'] and adata.var['highly_variable']. Neither column is
# created by the cells above this one, so the check is skipped when they are absent
# instead of failing with a KeyError.

# scipy is expected to be installed already (scanpy depends on it), so no pip install here
from scipy.stats import fisher_exact

required_cols = ['status', 'highly_variable']
missing_cols = [col for col in required_cols if col not in adata.var.columns]

if missing_cols:
    print(f"Skipping sanity check: adata.var is missing {missing_cols}.")
else:
    is_mapped = adata.var['status'] == 'mapped'

    # HVG selection restricted to the mapped genes
    adata_mapped = adata[:, is_mapped].copy()
    sc.pp.highly_variable_genes(
        adata_mapped,
        flavor='seurat_v3',
        n_top_genes=3000
    )
    adata.var['mapped_highly_variable'] = False
    adata.var.loc[adata_mapped.var_names[adata_mapped.var['highly_variable']], 'mapped_highly_variable'] = True

    # Fisher's exact test - 1. Checking if 'mapped' and 'highly_variable' are independent
    table = pd.crosstab(
        is_mapped,
        adata.var['highly_variable']
    )
    # fisher_exact needs a 2x2 table; add zero rows/columns for categories that never occur
    table = table.reindex(index=[False, True], columns=[False, True], fill_value=0)

    odds, p = fisher_exact(table)
    print(table)
    print(f"Odds ratio : {odds:.2f}, p-value : {p:.2e}")

    # Check HVG overlap
    hvg_all = set(adata.var_names[adata.var['highly_variable']])
    hvg_mapped = set(adata.var_names[adata.var['mapped_highly_variable']])

    overlap = len(hvg_all & hvg_mapped)
    if hvg_all:
        print(f"HVG overlap: {overlap} / {len(hvg_all)} = {overlap/len(hvg_all):.1%}")
    else:
        # Avoid dividing by zero when nothing is flagged as highly variable
        print("HVG overlap: no highly variable genes are flagged.")

In [None]:
## 2. QC
#
# Flags gene families, computes QC metrics, plots them, filters cells/genes,
# stores raw counts in a layer, normalises + log-transforms X, and selects HVGs.

# Gene-family flags derived from var_names:
#   mt   : names starting with "MT-"
#   ribo : names starting with "RPS" or "RPL"
#   hb   : names starting with "HB" followed by a character other than "(", "P", ")"
gene_names = adata.var_names
adata.var["mt"] = gene_names.str.startswith("MT-")
adata.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
adata.var["hb"] = gene_names.str.contains("^HB[^(P)]")

qc_flags = ["mt", "ribo", "hb"]
sc.pp.calculate_qc_metrics(adata, qc_vars=qc_flags, inplace=True, log1p=True)

# QC plots (computed before any filtering)
violin_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata, violin_keys, jitter=0.4, multi_panel=True, save="qc.png")
sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt", save="qc.png")

# Basic filters: cells with at least 100 genes, genes seen in at least 3 cells
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)

# Keep a copy of X in the "counts" layer before X is normalised below
adata.layers["counts"] = adata.X.copy()

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata

# HVG selection reads the "counts" layer, not the normalised X
hvg_options = dict(
    n_top_genes=3000,
    layer="counts",
    flavor="seurat_v3",
    batch_key="dataset",
)
sc.pp.highly_variable_genes(adata, **hvg_options)

adata


In [None]:
# PCA -> neighbour graph on X_pca -> UMAP -> Leiden clusters, all seeded with SEED,
# followed by a UMAP coloured by the Leiden labels.
seeded = {"random_state": SEED}

sc.pp.pca(adata, **seeded)
sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_pca", **seeded)
sc.tl.umap(adata, **seeded)
sc.tl.leiden(adata, resolution=0.5, **seeded)

sc.pl.umap(adata, color=["leiden"])

In [None]:
# Colour the UMAP by three marker genes and save the figure
marker_genes = ["PTPRC", "P2RY12", "CD14"]
sc.pl.umap(adata, color=marker_genes, save="_pca_CD45_P2RY12_CD14.png")

In [None]:
# 3. Model setup
# R : 20251020

# Register `adata` for scVI: model the "counts" layer, treat "sample_id" as the batch,
# and pass "region" as an additional categorical covariate.
# categorical_covariate_keys takes a list of obs column names; a bare string would be
# iterated character by character ("r", "e", "g", ...) instead of naming one column.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="sample_id",
    categorical_covariate_keys=["region"],
)

In [None]:
# 3
# Build an scVI model on the registered AnnData and train it with the default settings
model = scvi.model.SCVI(adata=adata)
model.train()

In [None]:
# 3
# Save the trained model under OUT_DIR/model_v2, replacing any earlier save there;
# the AnnData object is not written alongside the model.
model_dir = os.path.join(OUT_DIR, "model_v2")
os.makedirs(model_dir, exist_ok=True)

save_options = dict(save_anndata=False, overwrite=True)
model.save(model_dir, **save_options)

In [None]:
# @title
## Dumped 20251020 : too many keys erase biology.
# Retired version of the scVI setup, kept for reference.
# NOTE(review): this cell is still executable. Running it after the current
# "3. Model setup" cell calls setup_anndata again on the same `adata` with a
# different covariate set -- confirm whether it should be run at all.
# NOTE(review): "sample_id" appears both as batch_key and in the categorical covariates.
# 2. Model setup
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="sample_id",
    categorical_covariate_keys=[
        "patient_id", "sample_id", "dataset", "sex", "dx", "dx_subtype",
        "region", "hemisphere", "procedure", "protocol"
    ],
    continuous_covariate_keys=[
        "age"
    ]
)

In [None]:
# @title
# Dumped 20251020 : too many logical flaws
# Retired version of the MyGene annotation, kept for reference.
# NOTE(review): this cell is still executable and depends on adata.var columns
# ('ensembl_id', 'status', 'log1p_total_counts') that it does not create itself.
# 2. Annotate ENSG with HGNC symbols

!pip install mygene
import mygene

mg = mygene.MyGeneInfo()

# Query every gene_ids value in one call (IDs are sent as-is, including any version suffix)
genes = list(adata.var["gene_ids"])
res = mg.querymany(
    genes,
    scopes="ensembl.gene",
    fields="symbol",
    species="human",
    returnall=True
)

# Convert MyGene results into a DataFrame
df = pd.DataFrame(res.get('out', []))

# Extract clean Ensembl IDs (remove version suffixes)
df['ensembl_id'] = df['query'].str.split('.').str[0]

# Keep only rows with a valid symbol
df = df[~df['symbol'].isna()].drop_duplicates(subset='ensembl_id')

# Join by Ensembl ID (uses the 'ensembl_id' column of adata.var as the join key)
adata.var = adata.var.join(df.set_index('ensembl_id')['symbol'], on='ensembl_id')

# Rename column for clarity
# NOTE(review): if adata.var already has a 'gene_symbol' column, this rename
# produces a second column with the same name.
adata.var.rename(columns={'symbol': 'gene_symbol'}, inplace=True)

# Optional : rescue highly expressed missing genes
# Genes with status 'missing' and log1p_total_counts above 7.5 are relabelled 'rescued'.
# NOTE(review): 'log1p_total_counts' presumably comes from the QC metrics step -- confirm.
rescue_mask = (adata.var['status'] == 'missing') & (adata.var['log1p_total_counts'] > 7.5)
adata.var.loc[rescue_mask, 'status'] = 'rescued'