In [22]:
#All the following script and comments have been made in accordance to single cell  data ##

#Importing packages # Make sure that you activate correct environment (conda activate scvi-env)
import os
import tempfile
import scanpy as sc
import scvi
import seaborn as sns
import torch
import scikit

scvi.settings.seed = 0

sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
torch.set_float32_matmul_precision("high")
save_dir = tempfile.TemporaryDirectory()

ModuleNotFoundError: No module named 'scikit'

In [None]:
#LOADING AND DATA PREPERATION#
adata = sc.read_h5ad('/Users/srivalli/Desktop/Heart/hca_heart_immune_download.h5ad')
adata

#Data filtration
sc.pp.filter_genes(adata, min_counts=3)

#Data normalizartion by adding counts and storing in raw for backup
adata.layers["counts"] = adata.X.copy()  
sc.pp.normalize_total(adata, target_sum=1e4) #10000 genes per cell
sc.pp.log1p(adata)
adata.raw = adata 


In [None]:
#Feature selection i.e., Highly Variable Genes selection
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=1200,
    subset=True,
    layer="counts",
    flavor="seurat_v3",
    batch_key="cell_source",
)

In [None]:
#Annotating data 

scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    categorical_covariate_keys=["cell_source", "donor"],
    continuous_covariate_keys=["percent_mito", "percent_ribo"],
)

In [None]:
##CREATING AND TRAINING MODEL##

#Creating model
model = scvi.model.SCVI(adata)
model

#Training
model.train()


In [None]:
#Saving model
model_dir = os.path.join(save_dir.name, "scvi_model")
model.save(model_dir, overwrite=True)

#Loading model
model = scvi.model.SCVI.load(model_dir, adata=adata)

In [None]:
##OBTAINING MODEL OUTPUTS##

#Storing outputs of scvi to anndata
SCVI_LATENT_KEY = "X_scVI"

latent = model.get_latent_representation()
adata.obsm[SCVI_LATENT_KEY] = latent
latent.shape

In [None]:
#Subsetting data based on celltype
adata_subset = adata[adata.obs.cell_type == "Fibroblast"]
latent_subset = model.get_latent_representation(adata_subset)
latent.shape

In [None]:
#Obtaining normalized expression values
denoised = model.get_normalized_expression(adata_subset, library_size=1e4)
denoised.iloc[:5, :5]

#Storing back normalized values back to anndata
SCVI_NORMALIZED_KEY = "scvi_normalized"
adata.layers[SCVI_NORMALIZED_KEY] = model.get_normalized_expression(library_size=10e4)

In [None]:
##INTEROPERABOILITY WITH SCANPY##

#Visualization without batch correction

# Dimensionality reduction and UMAP generation
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=30, n_neighbors=20)
sc.tl.umap(adata, min_dist=0.3)

#Only cell type
sc.pl.umap(
    adata,
    color=["cell_type"],
    frameon=False,
)

#Mapping based on cell source and donor
sc.pl.umap(
    adata,
    color=["donor", "cell_source"],
    ncols=2,
    frameon=False,
)


In [None]:
#Visualization with batch correction

# Using scVI latent space for UMAP generation
sc.pp.neighbors(adata, use_rep=SCVI_LATENT_KEY)
sc.tl.umap(adata, min_dist=0.3)

#Only cell type
sc.pl.umap(
    adata,
    color=["cell_type"],
    frameon=False,
)

#Mapping based on cell source and donor
sc.pl.umap(
    adata,
    color=["donor", "cell_source"],
    ncols=2,
    frameon=False,
)

In [None]:
#Clustering on the scVI latent space

# Loading neighbors calculated by scvi
SCVI_CLUSTERS_KEY = "leiden_scVI"
sc.tl.leiden(adata, key_added=SCVI_CLUSTERS_KEY, resolution=0.5)

sc.pl.umap(
    adata,
    color=[SCVI_CLUSTERS_KEY],
    frameon=False,
)


In [None]:
##DIFFERENTIAL EXPRESSION##

#Getting headers  of cell types
adata.obs.cell_type.head()

#Differential expression of 1 vs 1
de_df = model.differential_expression(
    groupby="cell_type", group1="Endothelial", group2="Fibroblast"
)
de_df.head()

#Differential expression of 1 vs all
de_df = model.differential_expression(
    groupby="cell_type",
)
de_df.head()

In [None]:
#Extracting top markers for each cluster
markers = {}
cats = adata.obs.cell_type.cat.categories
for i, c in enumerate(cats):
    cid = f"{c} vs Rest"
    cell_type_df = de_df.loc[de_df.comparison == cid]

    cell_type_df = cell_type_df[cell_type_df.lfc_mean > 0]

    cell_type_df = cell_type_df[cell_type_df["bayes_factor"] > 3]
    cell_type_df = cell_type_df[cell_type_df["non_zeros_proportion1"] > 0.1]

    markers[c] = cell_type_df.index.tolist()[:3]

#Dendogram
sc.tl.dendrogram(adata, groupby="cell_type", use_rep="X_scVI")

#Dot plot
sc.pl.dotplot(
    adata,
    markers,
    groupby="cell_type",
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#Heatmap visualization with normalized values as layers
sc.pl.heatmap(
    adata,
    markers,
    groupby="cell_type",
    layer="scvi_normalized",
    standard_scale="var",
    dendrogram=True,
    figsize=(8, 12),
)