In [None]:
import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import igraph
import leidenalg

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip

# Report the torch and scvi versions and whether torch can see a CUDA device,
# one value per line.
print(torch.__version__, scvi.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
# not for github

# Directory for plots; created here if it does not exist yet.
# The path sits under /content/drive/MyDrive (presumably a Colab Google Drive mount).
# NOTE(review): no cell visible in this notebook reads `tempdir` afterwards —
# confirm whether figure output was meant to be directed here.
tempdir = "/content/drive/MyDrive/datas/plots"
os.makedirs(tempdir, exist_ok=True)

In [None]:
# not for github

# Folder that holds the .h5ad file(s) and the "model" sub-directory used below.
SOURCEDIR = "/content/drive/MyDrive/datas/epilepsy_microglia/processed/GSE201048"
# Base name (without extension) of the .h5ad file that the load cell reads.
FILE_LABEL = "kumar"

In [None]:
# not for github
# Persist the in-memory AnnData to SOURCEDIR/kumar.h5ad.
# `adata` is only created by the load cell further down, so on a fresh kernel
# this cell used to stop "Run All" with a NameError; the guard makes it a no-op
# until `adata` exists.
if "adata" in globals():
    adata.write(os.path.join(SOURCEDIR, "kumar.h5ad"))

In [None]:
# Locations of the saved AnnData file and of the saved scVI model directory.
h5ad_path = os.path.join(SOURCEDIR, f"{FILE_LABEL}.h5ad")
model_dir = os.path.join(SOURCEDIR, "model")

# Read the AnnData first; the model is then loaded against that same object.
adata = sc.read_h5ad(h5ad_path)
model = scvi.model.SCVI.load(model_dir, adata=adata)

In [None]:
# Key under which the scVI embedding is stored in adata.obsm.
SCVI_LATENT_KEY = "X_scVI"
# Latent representation returned by the loaded model.
latent = model.get_latent_representation()
adata.obsm[SCVI_LATENT_KEY] = latent
# Last expression of the cell: displays the shape of the latent array.
latent.shape

In [None]:
# basic downstream
# Neighbour graph on the scVI embedding -> UMAP -> Leiden clustering -> UMAP plot.
# Every stochastic step receives the same seed.

SEED = 42
# Single source of truth for the clustering resolution: it is passed to Leiden
# and embedded in the figure file name, so the two cannot drift apart.
LEIDEN_RESOLUTION = 0.5

# Use the key under which the latent representation was stored in adata.obsm
# instead of repeating the string literal.
sc.pp.neighbors(adata, use_rep=SCVI_LATENT_KEY, random_state=SEED)
sc.tl.umap(adata, random_state=SEED)
sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION, random_state=SEED)
sc.pl.umap(
    adata,
    color=["leiden"],
    # Evaluates to "_kumar_leiden0.5.png" with the current FILE_LABEL / resolution.
    save=f"_{FILE_LABEL}_leiden{LEIDEN_RESOLUTION}.png",
)

In [None]:
# Preview the first rows of the gene-level table (adata.var).
adata.var.head()

In [None]:
# mygene으로 ensg 코드를 annotation한다

!pip install mygene
import mygene

mg = mygene.MyGeneInfo()

genes = list(adata.var["gene_ids"])
res = mg.querymany(
    genes,
    scopes="ensembl.gene",
    fields="symbol",
    species="human",
    returnall=True
)

In [None]:
# Clear symbol columns left over from an earlier run so the annotation cell can
# be re-run safely. That cell leaves its result under 'gene_symbol' (after
# joining it in as 'symbol'), so both names must be removed; dropping only
# 'symbol' would leave a duplicate 'gene_symbol' column after a second run.
# errors='ignore' makes this a no-op when neither column exists.
adata.var = adata.var.drop(columns=['symbol', 'gene_symbol'], errors='ignore')

In [None]:
# Convert MyGene results into a DataFrame
df = pd.DataFrame(res['out'])

# A hit list with no 'symbol' (or no rows at all) lacks these columns; add them
# empty so the steps below yield an empty lookup instead of raising KeyError.
for required_col in ('query', 'symbol'):
    if required_col not in df.columns:
        df[required_col] = np.nan

# Extract clean Ensembl IDs (remove version suffixes)
df['ensembl_id'] = df['query'].astype(str).str.split('.').str[0]

# Keep only rows with a valid symbol, and a single row per Ensembl ID so the
# lookup index is unique.
df = df[df['symbol'].notna()].drop_duplicates(subset='ensembl_id')

# Lookup table: version-less Ensembl ID -> symbol.
symbol_by_id = df.set_index('ensembl_id')['symbol']

# Strip version suffixes from adata.var['gene_ids'] as well, so both sides of
# the match use the same key (the lookup side is already version-less).
var_ids = adata.var['gene_ids'].astype(str).str.split('.').str[0]

# Store the result directly under its final name. Plain column assignment
# overwrites on re-run, so executing this cell twice cannot create duplicate
# 'gene_symbol' columns; genes without a hit get NaN.
adata.var['gene_symbol'] = var_ids.map(symbol_by_id)

In [None]:
# Label every gene by whether the symbol lookup produced a value.
missing_mask = adata.var['gene_symbol'].isna()
adata.var['status'] = np.where(missing_mask, 'missing', 'mapped')

# Violin plot of the 'log1p_total_counts' column split by mapping status.
# An explicit Axes is used so the figure can be titled and stands alone when
# the notebook is skimmed.
# NOTE(review): assumes adata.var already has a 'log1p_total_counts' column;
# it is not created in any cell visible here.
fig, ax = plt.subplots()
sns.violinplot(
    data=adata.var,
    x='status',
    y='log1p_total_counts',
    inner='box',
    ax=ax,
)
ax.set_title('log1p_total_counts by gene-symbol mapping status')
plt.show()

In [None]:
# Genes that have no symbol but whose log1p_total_counts is above 7.5 are
# relabelled from 'missing' to 'rescued'.
rescue_mask = (
    adata.var['status'].eq('missing')
    & adata.var['log1p_total_counts'].gt(7.5)
)

# Only the selected rows change; every other status value is left as it was.
adata.var.loc[rescue_mask, 'status'] = 'rescued'

In [None]:
# Number of genes labelled 'rescued'. value_counts() has no 'rescued' entry
# when nothing was rescued, so .get(..., 0) reports 0 instead of raising KeyError.
display(adata.var['status'].value_counts().get('rescued', 0))

In [None]:
# Show the gene-level table (adata.var) as the cell output.
adata.var

In [None]:
# Show only the genes for which a gene symbol is present.
display(adata.var.loc[adata.var['gene_symbol'].notna()])

In [None]:
# Print the full path of the un-dated .h5ad file inside SOURCEDIR.
print(os.path.join(SOURCEDIR, "kumar.h5ad"))

In [None]:
# not for github
# Save the current AnnData under a dated file name in SOURCEDIR.
adata.write(os.path.join(SOURCEDIR, "kumar_20251006.h5ad"))

In [None]:
# Delete the un-dated .h5ad file. The existence check makes the cell safe to
# re-run: os.remove raises FileNotFoundError once the file is already gone.
# NOTE(review): with FILE_LABEL == "kumar" this is the same file the load cell
# reads, so after this runs the notebook cannot be re-run from the top unless
# the file is written again — confirm that this is intended.
stale_h5ad = os.path.join(SOURCEDIR, "kumar.h5ad")
if os.path.exists(stale_h5ad):
    os.remove(stale_h5ad)