In [1]:
# This notebook and its comments are written for single-cell data.

# Imports. Make sure the correct environment is active first (conda activate scvi-env).
import os
import tempfile
import scanpy as sc
import scvi
import seaborn as sns
import torch
from rich import print
from scib_metrics.benchmark import Benchmarker

# Fix the random seed up front: model training below is stochastic, and without a
# fixed seed the results are not repeatable from one run to the next.
scvi.settings.seed = 0

# Location used when saving results at the end of the notebook.
output = './outputs'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the lung atlas dataset

# Path of the local .h5ad file.
lungdata_path = os.path.join("./", "lung_atlas.h5ad")

# Read the file; `backup_url` gives scanpy a download location for it
# (presumably used when the file is missing locally — see the scanpy docs).
lungdata = sc.read(lungdata_path, backup_url="https://figshare.com/ndownloader/files/24539942")

# Show a summary of the loaded AnnData object.
lungdata



AnnData object with n_obs × n_vars = 32472 × 15148
    obs: 'dataset', 'location', 'nGene', 'nUMI', 'patientGroup', 'percent.mito', 'protocol', 'sanger_type', 'size_factors', 'sampling_method', 'batch', 'cell_type', 'donor'
    layers: 'counts'

In [3]:
# DATA PREPROCESSING

# Keep a reference to the full, unfiltered object in `.raw` before subsetting genes.
lungdata.raw = lungdata

# Select the 2000 most highly variable genes from the "counts" layer, taking the
# "batch" column into account, and subset the object to those genes in place.
sc.pp.highly_variable_genes(
    lungdata,
    n_top_genes=2000,
    flavor="seurat_v3",
    batch_key="batch",
    layer="counts",
    subset=True,
)



In [4]:
# INTEGRATION WITH scVI
# Usually used when annotated data isn't available or isn't helpful.

# Register the fields scVI should read from the AnnData object:
# the "counts" layer as input and the "batch" column as the batch key.
scvi.model.SCVI.setup_anndata(
    lungdata,
    layer="counts",
    batch_key="batch",
)

# Build the model on the registered data.
model = scvi.model.SCVI(
    lungdata,
    n_layers=2,
    n_latent=30,
    gene_likelihood="nb",
)

# Train with the library's default training settings.
model.train()


For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)
  self.validate_field(adata)
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/srivalli/conda/envs/scvi-env/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/srivalli/conda/envs/scvi-env/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 246/246: 100%|██████████| 246/246 [27:56<00:00,  9.12s/it, v_num=1, train_loss_step=601, train_loss_epoch=552]

`Trainer.fit` stopped: `max_epochs=246` reached.


Epoch 246/246: 100%|██████████| 246/246 [27:56<00:00,  6.81s/it, v_num=1, train_loss_step=601, train_loss_epoch=552]


In [5]:
# Extract the latent representation learned by the scVI model

# Key under which the latent representation is stored in `lungdata.obsm`.
SCVI_LATENT_KEY = "X_scVI"

# Attach the latent representation to the AnnData object.
lungdata.obsm[SCVI_LATENT_KEY] = model.get_latent_representation()

In [6]:
# Optional clustering in the scVI latent space (currently disabled).
# NOTE(review): `omp_set_max_active_levels` does not look like a `sc.tl.leiden`
# argument (it matches the name in an OpenMP log message) — confirm before re-enabling.
#sc.pp.neighbors(lungdata, use_rep=SCVI_LATENT_KEY)
#sc.tl.leiden(lungdata,omp_set_max_active_levels = 5)
# Display the AnnData summary; `obsm` should now contain the scVI latent representation.
lungdata

AnnData object with n_obs × n_vars = 32472 × 2000
    obs: 'dataset', 'location', 'nGene', 'nUMI', 'patientGroup', 'percent.mito', 'protocol', 'sanger_type', 'size_factors', 'sampling_method', 'batch', 'cell_type', 'donor', '_scvi_batch', '_scvi_labels'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'X_scVI'
    layers: 'counts'

In [7]:
# Data visualization

# MDE is used here as an alternative to UMAP.
SCVI_MDE_KEY = "X_scVI_MDE"

# Data preparation: compute the MDE embedding from the scVI latent representation.
scvi_latent = lungdata.obsm[SCVI_LATENT_KEY]
lungdata.obsm[SCVI_MDE_KEY] = scvi.model.utils.mde(scvi_latent)
del scvi_latent  # temporary name only; keep the notebook namespace unchanged

# Plot the embedding, coloured by batch.
sc.pl.embedding(lungdata, basis=SCVI_MDE_KEY, color=["batch"], frameon=False, ncols=1)

[34mINFO    [0m Using cpu for `pymde.preserve_neighbors`.                                                                 


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
# Same MDE embedding, this time coloured by the annotated cell type.
sc.pl.embedding(
    lungdata,
    basis=SCVI_MDE_KEY,
    color=["cell_type"],
    frameon=False,
    ncols=1,
)

In [None]:
# INTEGRATION WITH scANVI
# Usually used when annotated data is available and gives the full label information.

# Initialise scANVI from the trained scVI model.
# - `adata` is the AnnData object to use (the keyword must be `adata`; any other
#   keyword name is treated as a model kwarg and rejected).
# - `labels_key` names the obs column holding the cell-type labels.
# - `unlabeled_category` is the label value that marks unlabelled cells.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    model,
    adata=lungdata,
    labels_key="cell_type",
    unlabeled_category="Unknown",
)

In [None]:
# Train the scANVI model for 20 epochs, with 100 samples per label.
scanvi_model.train(
    max_epochs=20,
    n_samples_per_label=100,
)

In [None]:
# Data visualization

# Store the scANVI latent representation in `obsm` (the benchmarking cell reads this key).
SCANVI_LATENT_KEY = "X_scANVI"
lungdata.obsm[SCANVI_LATENT_KEY] = scanvi_model.get_latent_representation(lungdata)

# Alternative to UMAP: project the latent space with MDE before plotting, as is done
# for the scVI latent space. Plotting the latent key directly would only show the
# first two latent dimensions rather than a 2D embedding of the whole space.
SCANVI_MDE_KEY = "X_scANVI_MDE"
lungdata.obsm[SCANVI_MDE_KEY] = scvi.model.utils.mde(lungdata.obsm[SCANVI_LATENT_KEY])

# Plot the embedding, coloured by cell type.
sc.pl.embedding(
    lungdata, basis=SCANVI_MDE_KEY, color=["cell_type"], ncols=1, frameon=False
)

In [None]:
# Display the AnnData summary to review the components added by the steps above.
lungdata

In [None]:
# COMPUTING INTEGRATION METRICS

# Configure the benchmark: compare the listed embeddings using "batch" as the batch
# key and "cell_type" as the label key.
# NOTE(review): "X_pca" is not computed anywhere in this notebook; it is presumably
# created by the Benchmarker's own preparation step — confirm, or run `sc.tl.pca` first.
bm = Benchmarker(
    lungdata,
    batch_key="batch",
    label_key="cell_type",
    embedding_obsm_keys=["X_pca", SCVI_LATENT_KEY, SCANVI_LATENT_KEY],
    n_jobs=-1,
)

# Run the benchmark.
bm.benchmark()

# Plot the results as a table (unscaled scores).
bm.plot_results_table(min_max_scale=False)

# Collect the results as a DataFrame and print them.
df = bm.get_results(min_max_scale=False)
print(df)

# Save the results to a CSV file inside the output directory.
# pandas DataFrames have no `.write` method, and `output` is a directory path,
# so create the directory if needed and write a named file into it.
os.makedirs(output, exist_ok=True)
df.to_csv(os.path.join(output, "integration_metrics.csv"))