### Notebook for the label transfer to blood samples using `scANVI`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- v230710

### Import required modules

In [1]:
import scvi
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

  self.seed = seed
  self.dl_pin_memory_gpu_training = (


### Set up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.9.1
scanpy      1.9.3
-----
PIL                 10.0.0
absl                NA
aiohttp             3.8.4
aiosignal           1.3.1
anyio               NA
appnope             0.1.3
asttokens           NA
async_timeout       4.0.2
attr                23.1.0
backcall            0.2.0
bs4                 4.12.2
certifi             2023.05.07
charset_normalizer  3.2.0
chex                0.1.7
click               8.1.4
comm                0.1.3
contextlib2         NA
croniter            NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.7
decorator           5.1.1
deepdiff            6.3.1
docrep              0.3.2
etils               1.3.0
executing           1.2.0
fastapi             0.100.0
flax                0.7.0
frozenlist          1.3.3
fsspec              2023.6.0
h5py                3.9.0
idna                3.4
importlib_resources NA
ipykernel           6.24.0
ipywidgets          8.0.7
jax                 0.4.

In [3]:
warnings.simplefilter(action = 'ignore')
scvi.settings.seed = 1712
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'

Global seed set to 1712


In [4]:
arches_params = dict(
    use_layer_norm = "both",
    use_batch_norm = "none",
    encode_covariates = True,
    dropout_rate = 0.2,
    n_layers = 2,
)

### Read in Healthy data

In [5]:
SCC0120_1_blood = sc.read_h5ad('../data/SCC0120_1_Blood_scANVI_states_ctl230704.h5ad')
SCC0120_1_blood

AnnData object with n_obs × n_vars = 1360 × 31908
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC0120_1_pbmc', 'pct_

In [6]:
SCC0120_1_blood.obs['sample'].cat.categories

Index(['pbmc_1', 'pbmc_2'], dtype='object')

In [7]:
SCC0120_1_PBMC = SCC0120_1_blood[SCC0120_1_blood.obs['sample'].isin(['pbmc_1'])]
SCC0120_1_PBMC

View of AnnData object with n_obs × n_vars = 833 × 31908
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC0120_1_pbmc'

In [8]:
SCC0120_2_PBMC = SCC0120_1_blood[SCC0120_1_blood.obs['sample'].isin(['pbmc_2'])]
SCC0120_2_PBMC

View of AnnData object with n_obs × n_vars = 527 × 31908
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC0120_1_pbmc'

In [9]:
SCC0120_1_PBMC.obs['seed_labels'] = SCC0120_1_PBMC.obs['C_scANVI'].copy()
SCC0120_2_PBMC.obs['seed_labels'] = 'Unknown'

In [10]:
adata = SCC0120_1_PBMC.concatenate(SCC0120_1_PBMC, batch_key = 'skin_condition', batch_categories = ['pbmc_1', 'pbmc_2'], join = 'inner')
adata

AnnData object with n_obs × n_vars = 1666 × 31908
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI', 'skin_condition'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC0

In [11]:
adata.obs['seed_labels'].value_counts()

seed_labels
T CD4 helper     614
T CD4 naive      390
T CD8 CTL        386
T CD8 CM         112
NK               100
T CD8 naive       38
Monocyte CD14     10
pDC               10
Monocyte CD16      6
Name: count, dtype: int64

### Select HVGs

In [12]:
adata_raw = adata.copy()
adata.layers['counts'] = adata.X.copy()

sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "donor",
    subset = True
)

adata

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


AnnData object with n_obs × n_vars = 1666 × 7000
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI', 'skin_condition'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC01

### Transfer of annotation with scANVI

In [13]:
scvi.model.SCVI.setup_anndata(adata,
                        batch_key = 'donor',
                        labels_key = 'seed_labels',
                        categorical_covariate_keys = ['donor', 'batch'],                    
                        continuous_covariate_keys = ['n_genes', 'n_counts'], 
                        layer = 'counts')

In [14]:
scvi_model = scvi.model.SCVI(adata, 
                             n_latent = 50, 
                             n_layers = 3,
                             dispersion = 'gene-batch',
                             gene_likelihood = 'nb')

In [16]:
scvi_model.train(use_gpu = False)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


Epoch 400/400: 100%|██████████| 400/400 [07:38<00:00,  1.65s/it, v_num=1, train_loss_step=4.05e+3, train_loss_epoch=4.11e+3]

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 400/400: 100%|██████████| 400/400 [07:38<00:00,  1.15s/it, v_num=1, train_loss_step=4.05e+3, train_loss_epoch=4.11e+3]


### Label transfer with `scANVI` 

In [17]:
scanvi_model = scvi.model.SCANVI.from_scvi_model(scvi_model, 'Unknown')

In [19]:
scanvi_model.train(use_gpu = False)

[34mINFO    [0m Training for [1;36m10[0m epochs.                                                                                   


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 8/10:  70%|███████   | 7/10 [00:05<00:02,  1.35it/s, v_num=1, train_loss_step=4.06e+3, train_loss_epoch=4.16e+3]

In [None]:
adata.obs["C_scANVI"] = scanvi_model.predict(adata)

- Extract latent representation

In [None]:
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

- Visualise corrected dataset

In [None]:
sc.pp.neighbors(adata, use_rep = "X_scANVI", n_neighbors = 50, metric = 'minkowski')
sc.tl.umap(adata, min_dist = 0.3, spread = 4, random_state = 1712)
sc.pl.umap(adata, frameon = False, color = ['donor', 'condition', 'seed_labels', 'batch', 'C_scANVI'], size = 3, legend_fontsize = 5, ncols = 3)

### Export annotated object

In [None]:
adata_skin = anndata.AnnData(X = adata_raw.X, var = adata_raw.var, obs = adata.obs, obsm = adata.obsm) 
adata_skin

In [None]:
SCC0120_1_skin_annotated = adata_skin[adata_skin.obs['group'].isin(['SCC0120_1_skin'])]
SCC0120_1_skin_annotated