**2. Preprocess**

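0. Check `var_names` (Ensembl IDs vs. gene symbols) and make names unique
1. QC (mitochondrial / ribosomal / hemoglobin percentages), filtering, normalization, HVG selection
2. Set up and train the scVI model
3. MyGene annotation (Ensembl ID → gene symbol)
(3-1. Optional: annotation sanity check)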


In [None]:
import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import igraph
import leidenalg

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip

# Report the PyTorch version, the scvi-tools version, and whether PyTorch
# can see a CUDA device (one value per line, in that order).
for runtime_info in (torch.__version__, scvi.__version__, torch.cuda.is_available()):
    print(runtime_info)

In [None]:
# Input / output locations for the GSE201048 data.
# The defaults are the original Google Drive (Colab) paths, so behavior is
# unchanged when nothing is configured. Set the GSE201048_RAW_DIR and/or
# GSE201048_OUT_DIR environment variables to run the notebook elsewhere
# without editing it.
RAW_DIR = os.environ.get(
    "GSE201048_RAW_DIR",
    "/content/drive/MyDrive/datas/epilepsy_microglia/raw/GSE201048_raw/",
)
OUT_DIR = os.environ.get(
    "GSE201048_OUT_DIR",
    "/content/drive/MyDrive/datas/epilepsy_microglia/processed/GSE201048",
)
# Create the output directory up front so later writes into it do not fail.
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# not for github
# Scratch directory for plots (Colab-local path); created if it does not exist.
# NOTE(review): `tempdir` is not referenced by any other cell visible here, and
# the scanpy `save=` calls below do not pass it — confirm whether it was meant
# to configure the figure output directory.

tempdir = "/content/plots/"
os.makedirs(tempdir, exist_ok=True)

In [None]:
# not for github
# Reload the AnnData object stored at OUT_DIR/kumar.h5ad and display its summary.
# NOTE(review): on a fresh kernel this only works if that file already exists
# from an earlier session; the object is otherwise built by the concat cell below.

adata = sc.read_h5ad(os.path.join(OUT_DIR, "kumar.h5ad"))
adata

In [None]:
# not for github
# Persist the current `adata` to OUT_DIR/kumar.h5ad (the same path the reload cell reads).
adata.write(os.path.join(OUT_DIR, "kumar.h5ad"))

In [None]:
# not for github
# Display the per-gene annotation table (adata.var) for inspection.
adata.var

In [None]:
# not for github
# Bundle the raw GSE201048 directory into a gzip-compressed tarball written to
# the current working directory.
# NOTE(review): the directory is spelled out here instead of being taken from
# RAW_DIR; keep the two in sync if the data location changes.
!tar -czf GSE201048_raw.tar.gz /content/drive/MyDrive/datas/epilepsy_microglia/raw/GSE201048_raw

In [None]:
# Load every per-sample .h5ad file in RAW_DIR and concatenate them into one AnnData.
adatas = {}

# Sort the paths: glob order depends on the filesystem, and the concatenation
# order determines the order of cells in the merged object.
sample_paths = sorted(glob.glob(os.path.join(RAW_DIR, "*.h5ad")))
if not sample_paths:
    raise FileNotFoundError(f"No .h5ad files found in {RAW_DIR!r}")

for sample_path in sample_paths:
    sample_adata = sc.read_h5ad(sample_path)
    sample_adata.var_names_make_unique()
    # The first cell's `sample_id` is used as the ID for the whole file.
    sample_id = sample_adata.obs["sample_id"].iloc[0]
    if sample_id in adatas:
        # Without this check the earlier sample would be silently replaced
        # in the dict and dropped from the merged object.
        raise ValueError(
            f"Duplicate sample_id {sample_id!r} found in {sample_path!r}; "
            "each .h5ad file must hold a distinct sample."
        )
    adatas[sample_id] = sample_adata

# label/keys (re)write obs["sample_id"] from the dict keys; merge="same" keeps
# only the var annotations that are identical across all samples.
adata = ad.concat(
    list(adatas.values()),
    label="sample_id",
    keys=list(adatas.keys()),
    merge="same",
)
adata.obs_names_make_unique()
print(adata.obs["sample_id"].value_counts())
adata

In [None]:
# 0. Check var names (Ensembl IDs vs. gene symbols): print the first ten for a visual check.
print(adata.var_names[:10])

In [None]:
# 1. QC
# Boolean gene-family flags derived from var_names:
#   mt   - names starting with "MT-"
#   ribo - names starting with "RPS" or "RPL"
#   hb   - names starting with "HB" followed by any character other than "(", "P" or ")"
gene_family_masks = {
    "mt": adata.var_names.str.startswith("MT-"),
    "ribo": adata.var_names.str.startswith(("RPS", "RPL")),
    "hb": adata.var_names.str.contains("^HB[^(P)]"),
}
for family, mask in gene_family_masks.items():
    adata.var[family] = mask

# Compute per-cell / per-gene QC metrics (including log1p variants) for the three flags.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=list(gene_family_masks),
    inplace=True,
    log1p=True,
)

# Distributions of the main per-cell QC metrics.
qc_metric_columns = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata, qc_metric_columns, jitter=0.4, multi_panel=True, save="qc.png")

# Library size vs. detected genes, colored by mitochondrial fraction.
# NOTE(review): both plots pass save="qc.png"; scanpy is expected to prepend the
# plot type to the file name so they should not collide — verify in the output folder.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
    save="qc.png",
)


In [None]:
# 1. Filtering, normalization and highly-variable-gene (HVG) selection.
# Statement order matters in this cell: the counts snapshot must be taken
# before X is normalized.

# Drop cells with fewer than 100 detected genes, then genes detected in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)

# Snapshot X into the "counts" layer before it is transformed; the HVG call
# below and the scVI setup both read this layer.
# NOTE(review): assumes X still holds raw counts here — re-running this cell on
# an already-normalized object would store normalized values in "counts".
adata.layers["counts"] = adata.X.copy()

# Scale each cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep the log-normalized state accessible through adata.raw.
adata.raw = adata

# Flag the top 3000 HVGs from the "counts" layer (seurat_v3 flavor).
# No subsetting is requested, so adata keeps all genes after this call.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=3000,
    layer="counts",
    flavor="seurat_v3",
)

adata

In [None]:
# 2. Model setup
# Register the raw-count layer, the batch variable and the covariates with scVI.
#
# `sample_id` is the batch key, so it is NOT repeated in
# categorical_covariate_keys: listing it in both places would feed the model
# the same variable twice.
# NOTE(review): if the remaining covariates (patient_id, dataset, sex, dx, ...)
# are constant within each sample, they are fully determined by the batch key
# as well — confirm they are intended as additional nuisance covariates.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="sample_id",
    categorical_covariate_keys=[
        "patient_id", "dataset", "sex", "dx", "dx_subtype",
        "region", "hemisphere", "procedure", "protocol",
    ],
    continuous_covariate_keys=["age"],
)

In [None]:
# 2. Build the scVI model on the registered AnnData and train it with default settings.
# NOTE(review): no random seed is set in the cells visible here — confirm one is
# set somewhere if reproducible training is required.
model = scvi.model.SCVI(adata)
model.train()

In [None]:
# 2. Save the trained model under OUT_DIR/model (directory created if missing).
model_dir = os.path.join(OUT_DIR, "model")
os.makedirs(model_dir, exist_ok=True)
# save_anndata=False: adata is not written alongside the model;
# overwrite=True: an existing save in model_dir is replaced.
model.save(model_dir, save_anndata=False, overwrite=True)

In [None]:
# 3. Annotate ENSG with HGNC symbols

!pip install mygene
import mygene

mg = mygene.MyGeneInfo()

genes = list(adata.var["gene_ids"])
res = mg.querymany(
    genes,
    scopes="ensembl.gene",
    fields="symbol",
    species="human",
    returnall=True
)

# Convert MyGene results into a DataFrame
df = pd.DataFrame(res['out'])

# Extract clean Ensembl IDs (remove version suffixes)
df['ensembl_id'] = df['query'].str.split('.').str[0]

# Keep only rows with a valid symbol
df = df[~df['symbol'].isna()].drop_duplicates(subset='ensembl_id')

# Join by Ensembl ID
adata.var = adata.var.join(df.set_index('ensembl_id')['symbol'], on='gene_ids')

# Rename column for clarity
adata.var.rename(columns={'symbol': 'gene_symbol'}, inplace=True)

# Optional : rescue highly expressed missing genes
rescue_mask = (adata.var['status'] == 'missing') & (adata.var['log1p_total_counts'] > 7.5)
adata.var.loc[rescue_mask, 'status'] = 'rescued'

In [None]:
# 3-1 (Optional) : Sanity check of mapped subset
#
# Checks that restricting to genes with a mapped symbol does not distort HVG
# selection: (1) Fisher's exact test of mapped vs. highly_variable,
# (2) overlap between the HVGs of the full gene set and of the mapped subset.

# scipy is already a dependency of scanpy, so no install step is needed.
from scipy.stats import fisher_exact

adata_mapped = adata[:, adata.var["status"] == "mapped"].copy()

# seurat_v3 works on raw counts, but X is log-normalized at this point, so the
# call is pointed at the "counts" layer — same as the HVG call on the full object.
sc.pp.highly_variable_genes(
    adata_mapped,
    flavor="seurat_v3",
    n_top_genes=3000,
    layer="counts",
)
adata.var["mapped_highly_variable"] = False
mapped_hvg_names = adata_mapped.var_names[adata_mapped.var["highly_variable"]]
adata.var.loc[mapped_hvg_names, "mapped_highly_variable"] = True

# Fisher's exact test - 1. Checking if 'mapped' and 'highly_variable' are independent
# reindex forces a full 2x2 table even when one category is absent
# (e.g. every gene is mapped), which fisher_exact needs.
table = pd.crosstab(
    adata.var["status"] == "mapped",
    adata.var["highly_variable"],
).reindex(index=[False, True], columns=[False, True], fill_value=0)

odds, p = fisher_exact(table)
print(table)
# ".2e" (precision), not "2e" (which is only a minimum field width).
print(f"Odds ratio : {odds:.2f}, p-value : {p:.2e}")

# Check HVG overlap
hvg_all = set(adata.var_names[adata.var["highly_variable"]])
hvg_mapped = set(adata.var_names[adata.var["mapped_highly_variable"]])

overlap = len(hvg_all & hvg_mapped)
if hvg_all:
    print(f"HVG overlap: {overlap} / {len(hvg_all)} = {overlap/len(hvg_all):.1%}")
else:
    # Avoid a ZeroDivisionError when no gene is flagged as highly variable.
    print("HVG overlap: no highly variable genes flagged in adata.var")