### Notebook for the generation of an integrated manifold with `scANVI`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- v230811

### Import required modules

In [2]:
import torch
import scvi
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
from geosketch import gs
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state
from scib_metrics.benchmark import Benchmarker

### Set up working environment

In [3]:
# Verbose scanpy logging, plus a printed record of the package versions in this session.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every scanpy plot below.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                 9.4.0
absl                NA
aiohttp             3.8.4
aiosignal           1.3.1
anyio               NA
appnope             0.1.3
asttokens           NA
async_timeout       4.0.2
attr                23.1.0
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
bs4                 4.12.2
certifi             2022.12.07
cffi                1.15.1
charset_normalizer  2.1.1
chex                0.1.6
click               8.1.3
colorama            0.4.6
contextlib2         NA
croniter            NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
deepdiff            6.3.0
docrep              0.3.2
executing           1.2.0
fastapi             0.99.1
fbpca               NA
flax                0.5.0
frozenlist          1.3.3
fsspec              2023.3.0
geosketch           1.2
h5py                3.8.0
hypergeom

In [4]:
warnings.simplefilter(action = 'ignore')
scvi.settings.seed = 1712
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'
torch.set_float32_matmul_precision('medium')

Global seed set to 1712


In [5]:
# Model architecture settings kept in one place.
# NOTE(review): no other cell shown in this notebook reads `arches_params` — confirm
# whether it was meant to be unpacked into the model constructor.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 3,
}

### Read in Healthy data

In [6]:
# Load the healthy dataset; the path is relative to the notebook location.
healthy_h5ad_path = '../data/Marburg_cell_states_locked_ctl230811.raw.h5ad'
adata_raw = sc.read_h5ad(healthy_h5ad_path)
adata_raw

AnnData object with n_obs × n_vars = 97573 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'cell_compartment', 'seed_labels'
    var: 'mt', 'ribo'

### Randomly select 20K cells to use as the test set for `scANVI`

In [9]:
# Draw 20,000 cells with a fixed seed into a separate copy; `adata_raw` itself is not modified here.
adata_subset = sc.pp.subsample(
    adata_raw,
    n_obs = 20000,
    random_state = 1712,
    copy = True,
)

# Mask the compartment label of every sampled cell.
adata_subset.obs['cell_compartment'] = 'Unknown'
adata_subset

AnnData object with n_obs × n_vars = 20000 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'cell_compartment', 'seed_labels'
    var: 'mt', 'ribo'

In [11]:
# Cast both label columns to plain strings before writing the new 'Unknown' value
# (presumably because the original column is categorical — TODO confirm).
adata_raw.obs['cell_compartment'] = adata_raw.obs['cell_compartment'].astype(str)
adata_subset.obs['cell_compartment'] = adata_subset.obs['cell_compartment'].astype(str)

# Copy the masked labels of the sampled cells onto the full object, matched by obs index.
masked_labels = adata_subset.obs['cell_compartment']
adata_raw.obs.loc[masked_labels.index, 'cell_compartment'] = masked_labels

# Back to a categorical column, then tally the cells per compartment.
adata_raw.obs['cell_compartment'] = pd.Categorical(adata_raw.obs['cell_compartment'])
adata_raw.obs['cell_compartment'].value_counts()

Basal           38873
Goblet          22514
Unknown         20000
Club             7117
Ciliated         2956
Secretory        2492
SupraB           1581
Epi              1157
Ionocyte          506
Deuterosomal      377
Name: cell_compartment, dtype: int64

### Select HVGs

In [12]:
# Keep a full copy (all genes) before `adata_raw` is reduced to the selected genes below.
raw_adata = adata_raw.copy()

# Store the current matrix in a dedicated layer; HVG selection and the model read from it.
adata_raw.layers['counts'] = adata_raw.X.copy()

# Pick the top 7,000 variable genes per donor and subset the object in place.
sc.pp.highly_variable_genes(
    adata_raw,
    layer = "counts",
    flavor = "seurat_v3",
    batch_key = "donor",
    n_top_genes = 7000,
    subset = True,
)
adata_raw

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


AnnData object with n_obs × n_vars = 97573 × 7000
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'cell_compartment', 'seed_labels'
    var: 'mt', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg'
    layers: 'counts'

### Set up and train the base `scVI` model for annotation transfer

In [13]:
# Register the counts layer, the technical covariates and the label column with scvi-tools.
#
# The 20K test cells were masked as 'Unknown' in `cell_compartment`, and scANVI is later
# told that 'Unknown' is the unlabelled category. The labels therefore have to come from
# `cell_compartment`; with `seed_labels` the mask would never reach the model and the
# test cells would be trained on with their labels.
# NOTE(review): confirm that `cell_compartment` (not `seed_labels`) is the annotation to transfer.
scvi.model.SCVI.setup_anndata(
    adata_raw,
    layer = 'counts',
    categorical_covariate_keys = ["illumina_stimunr", "donor", "sample_group"],
    labels_key = "cell_compartment",
)

In [15]:
# Base scVI model: 50 latent dimensions, 3 layers, negative-binomial likelihood
# with gene-batch dispersion.
scvi_model = scvi.model.SCVI(
    adata_raw,
    n_layers = 3,
    n_latent = 50,
    gene_likelihood = 'nb',
    dispersion = 'gene-batch',
)

In [16]:
# Train the scVI model; no training arguments are overridden, so library defaults apply.
scvi_model.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 2/82:   1%|          | 1/82 [00:57<1:17:06, 57.12s/it, v_num=1, train_loss_step=3.41e+3, train_loss_epoch=3.45e+3]

### Label transfer with `scANVI` 

In [None]:
# Initialise scANVI from the trained scVI model; cells whose registered label is
# 'Unknown' are the unlabelled ones.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    unlabeled_category = 'Unknown',
)

In [None]:
# Train the scANVI model; no training arguments are overridden, so library defaults apply.
scanvi_model.train()

In [None]:
# Predict a label for every cell and keep the result in obs under 'C_scANVI'.
scanvi_predictions = scanvi_model.predict(adata_raw)
adata_raw.obs["C_scANVI"] = scanvi_predictions

- Extract latent representation

In [None]:
# Store the scANVI latent space in obsm; the neighbours graph below is built on it.
scanvi_latent = scanvi_model.get_latent_representation(adata_raw)
adata_raw.obsm["X_scANVI"] = scanvi_latent

- Visualise corrected dataset

In [None]:
# Neighbourhood graph on the scANVI latent space, followed by a seeded UMAP.
sc.pp.neighbors(
    adata_raw,
    n_neighbors = 50,
    use_rep = "X_scANVI",
    metric = 'minkowski',
)
sc.tl.umap(adata_raw, random_state = 1712, min_dist = 0.3, spread = 4)

# Overview panels: biological groupings, transferred labels and batch covariates.
sc.pl.umap(
    adata_raw,
    color = [
        'group', 'disease', 'infection', 'C_scANVI',
        'seed_labels', 'donor', 'bd_rhapsody', 'illumina_stimunr',
    ],
    ncols = 4,
    size = 1,
    legend_fontsize = 5,
    frameon = False,
)

In [None]:
# QC and technical covariates on the same UMAP.
# `adata` is never defined in this notebook; the neighbours graph and UMAP were computed
# on `adata_raw`, so that is the object to plot.
sc.pl.umap(
    adata_raw,
    frameon = False,
    color = [
        'SMK', 'n_genes', 'doublet_scores', 'batch',
        'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
        'total_counts_ribo', 'pct_counts_ribo', 'n_counts', 'sample_group',
        'IAV_score',
    ],
    size = 1,
    legend_fontsize = 5,
    ncols = 4,
    cmap = 'plasma',
)

### Compute integration metrics

In [None]:
# `adata` is never defined in this notebook; the integrated object is `adata_raw`.

# Benchmarker expects a single obs column as batch key, so the covariates used for the
# integration are collapsed into one composite label.
# NOTE(review): confirm a composite batch is what should be scored (vs. e.g. 'donor' alone).
batch_covariates = ["illumina_stimunr", "donor", "sample_group"]
composite_batch = adata_raw.obs[batch_covariates[0]].astype(str)
for covariate in batch_covariates[1:]:
    composite_batch = composite_batch + "|" + adata_raw.obs[covariate].astype(str)
adata_raw.obs["benchmark_batch"] = pd.Categorical(composite_batch)

# Every embedding listed below must exist in `.obsm` before benchmarking.
if "X_scVI" not in adata_raw.obsm:
    adata_raw.obsm["X_scVI"] = scvi_model.get_latent_representation(adata_raw)
if "X_pca" not in adata_raw.obsm:
    # Unintegrated baseline. NOTE(review): this PCA runs on whatever is in `.X`
    # (counts here), and Benchmarker may recompute 'X_pca' itself — confirm for the
    # installed scib-metrics version.
    sc.tl.pca(adata_raw)

bm = Benchmarker(
    adata_raw,
    batch_key = "benchmark_batch",
    label_key = "C_scANVI",
    embedding_obsm_keys = ["X_pca", "X_scVI", "X_scANVI"],
    n_jobs = -1,
)
bm.benchmark()

In [None]:
# Show the benchmark results table with min-max scaling turned off.
bm.plot_results_table(min_max_scale = False)

### Export annotated sample object 

In [None]:
# Keep only the first three '-'-separated fields of every cell name.
# `adata` is never defined in this notebook; the other AnnData kept alongside `adata_raw`
# is `raw_adata` (the full-gene copy taken before HVG selection), so its names are
# shortened here. `adata_raw` is handled in the following cell.
raw_adata.obs.index = pd.Index(['-'.join(idx.split('-')[:3]) for idx in raw_adata.obs.index])
raw_adata.obs.index

In [None]:
# Shorten every cell name to its first three '-'-separated fields.
trimmed_names = ['-'.join(name.split('-')[:3]) for name in adata_raw.obs.index]
adata_raw.obs.index = pd.Index(trimmed_names)
adata_raw.obs.index

In [None]:
# Inspect the cell names of the annotated object (`adata` is never defined in this notebook).
adata_raw.obs_names

In [None]:
# List the distinct predicted labels (`adata` is never defined in this notebook).
# The cast makes `.cat` valid whether or not the column has already been converted to a categorical.
adata_raw.obs['C_scANVI'].astype('category').cat.categories

In [None]:
# Number of cells per predicted label (`adata` is never defined in this notebook).
adata_raw.obs['C_scANVI'].value_counts()

### Export annotated object with raw counts

In [None]:
# Display the full-gene copy taken before HVG selection.
# `adata` is never defined in this notebook; `adata_raw` is shown in the next cell.
raw_adata

In [None]:
# Display the HVG-subset object that carries the scANVI predictions and embeddings.
adata_raw

In [None]:
# Build the export object: counts for all genes plus the annotations and embeddings.
#
# `adata` is never defined in this notebook. The annotated object is `adata_raw`, which
# was subset to the HVGs; `raw_adata` is the copy taken before that subsetting, so it
# still holds every gene for the same cells in the same order.
assert raw_adata.n_obs == adata_raw.n_obs, "raw_adata and adata_raw must contain the same cells"

if "X_scVI" not in adata_raw.obsm:
    # The scVI latent space may not have been stored yet; compute it from the trained model.
    adata_raw.obsm["X_scVI"] = scvi_model.get_latent_representation(adata_raw)

adata_export = anndata.AnnData(X = raw_adata.X, obs = adata_raw.obs, var = raw_adata.var)
for embedding_key in ('X_scVI', 'X_umap', 'X_scANVI'):
    adata_export.obsm[embedding_key] = adata_raw.obsm[embedding_key].copy()
adata_export

In [None]:
# Write the export object to disk as h5ad.
# NOTE(review): the output name carries the tag 'ctl230330' while the input file read at
# the top of the notebook is tagged 'ctl230811' — confirm the intended file name.
adata_export.write('../data/Marburg_All_ctl230330_scANVI_annot.raw.h5ad')