In [1]:
#All the following script and comments have been made in accordance to single cell  data ##

#Importing packages # Make sure that you activate correct environment
import os
import tempfile
import scanpy as sc
import scvi
import seaborn as sns
import torch

scvi.settings.seed = 0

sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
torch.set_float32_matmul_precision("high")
save_dir = "/Users/srivalli/Desktop/SCA-Uni/Single-cell-data-analysis/SCVI/outputs"

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 0


In [2]:
#LOADING AND DATA PREPERATION#
adata = sc.read_h5ad('./hca_heart_immune_download.h5ad')
adata

#Data filtration
sc.pp.filter_genes(adata, min_counts=3)

#Data normalizartion by adding counts and storing in raw for backup
adata.layers["counts"] = adata.X.copy()  
sc.pp.normalize_total(adata, target_sum=1e6) #1000000 genes per cell
sc.pp.log1p(adata)
adata.raw = adata 

In [3]:
#Feature selection i.e., Highly Variable Genes selection
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=1200,
    subset=True,
    layer="counts",
    flavor="seurat_v3",
    batch_key="cell_source",
)

In [4]:
#Annotating data 

scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    categorical_covariate_keys=["cell_source", "donor"],
    continuous_covariate_keys=["percent_mito", "percent_ribo"],
)


For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)


In [5]:
##CREATING AND TRAINING MODEL##

#Creating model
model = scvi.model.SCVI(adata)
model

#Training
model.train()

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/srivalli/conda/envs/scvi-env/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/srivalli/conda/envs/scvi-env/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 29/196:  14%|█▍        | 28/196 [01:58<12:08,  4.34s/it, v_num=1, train_loss_step=309, train_loss_epoch=293]

In [None]:
#Saving model
model_dir = os.path.join(save_dir, "scvi_model")
model.save(model_dir, overwrite=True)

#Loading model
model = scvi.model.SCVI.load(model_dir, adata=adata)

[34mINFO    [0m File [35m/Users/srivalli/Desktop/SCA-Uni/Single-cell-data-analysis/SCVI/outputs/scvi_model/[0m[95mmodel.pt[0m already   
         downloaded                                                                                                



For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)


In [None]:
##OBTAINING MODEL OUTPUTS##

#Storing outputs of scvi to anndata
SCVI_LATENT_KEY = "X_scVI"

latent = model.get_latent_representation()
adata.obsm[SCVI_LATENT_KEY] = latent
latent.shape

(40868, 10)

In [None]:
#Subsetting data based on celltype
adata_subset = adata[adata.obs.scNym == "CD4+T_cell"]
latent_subset = model.get_latent_representation(adata_subset)
latent.shape

[34mINFO    [0m Received view of anndata, making copy.                                                                    
[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             


(40868, 10)

In [None]:
#Obtaining normalized expression values
denoised = model.get_normalized_expression(adata_subset, library_size=1e6)
denoised.iloc[:5, :5]

#Storing back normalized values back to anndata in a key
SCVI_NORMALIZED_KEY = "scvi_normalized"
adata.layers[SCVI_NORMALIZED_KEY] = model.get_normalized_expression(library_size=1e6)

In [None]:
##INTEROPERABOILITY WITH SCANPY##

#Visualization without batch correction

# Dimensionality reduction and UMAP generation
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=30, n_neighbors=20)
sc.tl.umap(adata, min_dist=0.3)

#Only cell type
sc.pl.umap(
    adata,
    color=["cell_source"],
    frameon=False,
)

#Mapping based on cell source and donor
sc.pl.umap(
    adata,
    color=["donor", "cell_source"],
    ncols=2,
    frameon=False,
)


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
#Clustering on the scVI latent space

# Loading neighbors calculated by scvi
SCVI_CLUSTERS_KEY = "leiden_scVI"
sc.tl.leiden(adata, key_added=SCVI_CLUSTERS_KEY, resolution=0.5)

sc.pl.umap(
    adata,
    color=[SCVI_CLUSTERS_KEY],
    frameon=False,
)

In [None]:
##DIFFERENTIAL EXPRESSION##

#Getting headers  of cell types
adata.obs.cell_source.head()

#Differential expression of 1 vs 1
de_df = model.differential_expression(
    groupby="cell_source", group1="Harvard-Nuclei", group2="Sanger-CD45"
)
de_df.head()

In [None]:
#Differential expression of 1 vs all
de_df = model.differential_expression(
    groupby="cell_source",
)
de_df.head()

In [None]:
#Extracting top markers for each cluster
markers = {}
cats = adata.obs.cell_source.cat.categories
for i, c in enumerate(cats):
    cid = f"{c} vs Rest"
    cell_source_df = de_df.loc[de_df.comparison == cid]

    cell_source_df = cell_source_df[cell_source_df.lfc_mean > 0]

    cell_source_df = cell_source_df[cell_source_df["bayes_factor"] > 3]
    cell_source_df = cell_source_df[cell_source_df["non_zeros_proportion1"] > 0.1]

    markers[c] = cell_source_df.index.tolist()[:3]

#Dendogram
sc.tl.dendrogram(adata, groupby="cell_source", use_rep="X_scVI")

#Dot plot
sc.pl.dotplot(
    adata,
    markers,
    groupby="cell_source",
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#Heatmap visualization with normalized values as layers
sc.pl.heatmap(
    adata,
    markers,
    groupby="cell_source",
    layer="scvi_normalized",
    standard_scale="var",
    dendrogram=True,
    figsize=(8, 12),
)