### Notebook for label transfer from healthy PBMCs to study PBMCs using `scANVI`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- v230703

### Import required modules

In [1]:
import scvi
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


### Set up working environment

In [2]:
# Raise scanpy's logging verbosity and record the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults used by every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                 9.4.0
absl                NA
aiohttp             3.8.4
aiosignal           1.3.1
anyio               NA
appnope             0.1.3
asttokens           NA
async_timeout       4.0.2
attr                23.1.0
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
bs4                 4.12.2
certifi             2022.12.07
cffi                1.15.1
charset_normalizer  2.1.1
chex                0.1.6
click               8.1.3
colorama            0.4.6
contextlib2         NA
croniter            NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
deepdiff            6.3.0
docrep              0.3.2
executing           1.2.0
fastapi             0.99.1
flax                0.5.0
frozenlist          1.3.3
fsspec              2023.3.0
h5py                3.8.0
hypergeom_ufunc     NA
idna                3.4
igraph   

In [3]:
warnings.simplefilter(action = 'ignore')
scvi.settings.seed = 1712
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'

Global seed set to 1712


In [4]:
# Keyword arguments for model construction (the name suggests scArches-style
# settings -- TODO confirm).
# NOTE(review): this dictionary is not passed to any model in the cells shown here.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

### Read in Healthy data

In [5]:
# Load the annotated healthy PBMC reference from disk and show its summary.
healthy_raw = sc.read_h5ad(
    '../data/meyer_nikolic_healthy_pbmc_raw.h5ad'
)
healthy_raw

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states'
    var: 'name'

- Remove annotations with fewer than 10 cells

In [6]:
# Count the cells carrying each reference annotation (largest groups first).
healthy_raw.obs.loc[:, 'seed_labels'].value_counts()

T CD4 naive               32672
Monocyte CD14             20464
B naive                   19295
NK                        19085
T CD8 naive               16140
T CD4 helper              13552
T CD8 CTL                  9541
T CD8 CM                   5544
Monocyte CD16              4457
T reg                      3251
T g/d                      3183
B n-sw mem                 2993
Monocyte CD14 IFN stim     2559
NK CD56                    2353
MAIT                       2213
B sw mem                   2068
T CD4 naive IFN stim       1860
T CD8 EMRA                 1834
cDC2                       1371
T CD4 CTL                  1331
Cycling                    1012
B invar                     869
T CD8 EM                    795
B naive IFN stim            745
pDC                         706
Platelets                   626
Monocyte CD16 IFN stim      616
Monocyte CD16+C1            464
NK IFN stim                 433
HPC                         414
Plasma cells                305
NKT     

In [9]:
# Drop reference annotations represented by fewer than MIN_CELLS_PER_LABEL cells.
# The rare labels are derived from the counts instead of being typed in by hand,
# so the filter keeps working if the reference file changes
# (in the recorded run this removes 'HPC IFN stim', which has 2 cells).
MIN_CELLS_PER_LABEL = 10
seed_label_counts = healthy_raw.obs['seed_labels'].value_counts()
rare_seed_labels = seed_label_counts[seed_label_counts < MIN_CELLS_PER_LABEL].index.tolist()
print(f"Removing labels with fewer than {MIN_CELLS_PER_LABEL} cells: {rare_seed_labels}")

# Boolean mask keeps every cell whose label is not in the rare set.
healthy_filtered = healthy_raw[~healthy_raw.obs['seed_labels'].isin(rare_seed_labels)]
healthy_filtered

View of AnnData object with n_obs × n_vars = 173682 × 33559
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states'
    var: 'name'

In [10]:
# Re-count the cells per annotation after filtering to confirm the rare labels are gone.
healthy_filtered.obs.loc[:, 'seed_labels'].value_counts()

T CD4 naive               32672
Monocyte CD14             20464
B naive                   19295
NK                        19085
T CD8 naive               16140
T CD4 helper              13552
T CD8 CTL                  9541
T CD8 CM                   5544
Monocyte CD16              4457
T reg                      3251
T g/d                      3183
B n-sw mem                 2993
Monocyte CD14 IFN stim     2559
NK CD56                    2353
MAIT                       2213
B sw mem                   2068
T CD4 naive IFN stim       1860
T CD8 EMRA                 1834
cDC2                       1371
T CD4 CTL                  1331
Cycling                    1012
B invar                     869
T CD8 EM                    795
B naive IFN stim            745
pDC                         706
Platelets                   626
Monocyte CD16 IFN stim      616
Monocyte CD16+C1            464
NK IFN stim                 433
HPC                         414
Plasma cells                305
NKT     

### Read in other unannotated datasets

In [11]:
# Load the unannotated study dataset and list the tissues it contains.
SCC0120_1 = sc.read_h5ad(
    '../data/SCC0120_1_QCed_pre-processed_ctl230701.h5ad'
)
SCC0120_1.obs['tissue'].cat.categories

Index(['pbmc', 'skin'], dtype='object')

In [15]:
# Show the summary (dimensions, obs/var columns, layers) of the study dataset.
SCC0120_1

AnnData object with n_obs × n_vars = 8839 × 36611
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'predicted_doublets', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score'
    var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'batch_colors', 'tissue_colors'
    layers: 'counts', 'sqrt_norm'

In [16]:
# Keep only the PBMC cells of the study dataset.
# `.copy()` turns the subset into an independent object, so the `.obs`
# assignments below do not rely on AnnData silently converting a view into a copy.
SCC0120_1_pbmc = SCC0120_1[SCC0120_1.obs['tissue'].isin(['pbmc'])].copy()

# Mark every study cell as unlabelled; 'Unknown' is the category later handed to scANVI.
SCC0120_1_pbmc.obs['seed_labels'] = 'Unknown'

# Reuse the sample identifier as 'donor' so the column matches the reference's obs.
SCC0120_1_pbmc.obs['donor'] = SCC0120_1_pbmc.obs['sample'].copy()
SCC0120_1_pbmc

AnnData object with n_obs × n_vars = 1360 × 36611
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'predicted_doublets', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score', 'seed_labels', 'donor'
    var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'batch_colors', 'tissue_colors'
    layers: 'counts', 'sqrt_norm'

In [17]:
# Stack the healthy reference and the study PBMCs into a single object.
# `join = 'inner'` keeps only the genes present in both datasets; the dataset of
# origin is written to obs['group'].
# NOTE(review): SCC0120_1 already carries a 'group' column in obs -- confirm that
# using the same name for the dataset-of-origin label is intended.
adata = healthy_filtered.concatenate(
    SCC0120_1_pbmc,
    join = 'inner',
    batch_key = 'group',
    batch_categories = ['healthy_pbmc', 'SCC0120_1_pbmc'],
)
adata

AnnData object with n_obs × n_vars = 175042 × 31908
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'tissue', 'condition', 'n_genes', 'doublet_scores', 'predicted_doublets', 'hashtag', 'unique', 'group', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'S_score', 'G2M_score'
    var: 'gene_ids-SCC0120_1_pbmc', 'feature_types-SCC0120_1_pbmc', 'mt-SCC0120_1_pbmc', 'ribo-SCC0120_1_pbmc', 'n_cells_by_counts-SCC0120_1_pbmc', 'mean_counts-SCC0120_1_pbmc', 'pct_dropout_by_counts-S

### Set up and train the base `scVI` model for annotation transfer

In [18]:
# Register the combined object with scvi-tools: 'donor' is the batch covariate
# and 'seed_labels' holds the cell-type labels ('Unknown' for the study cells).
# NOTE(review): no `layer` is given, so the model presumably reads `adata.X` --
# confirm that X contains raw counts for both datasets.
scvi.model.SCVI.setup_anndata(
    adata,
    batch_key = 'donor',
    labels_key = "seed_labels",
)

In [19]:
# Build the scVI model that is later converted into a scANVI model.
# NOTE(review): `arches_params` defined earlier is not passed here -- confirm
# whether that is intentional (it sets n_layers = 2, this call uses 3).
scvi_model = scvi.model.SCVI(
    adata,
    n_latent = 50,
    n_layers = 3,
)

In [20]:
# Train the scVI model with the library's default training settings.
scvi_model.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 1/46:   0%|          | 0/46 [00:00<?, ?it/s]

### Label transfer with `scANVI` 

In [None]:
# Initialise scANVI from the trained scVI model; cells whose label equals
# 'Unknown' are treated as unlabelled.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    unlabeled_category = 'Unknown',
)

In [None]:
# Train the scANVI model with the library's default training settings.
scanvi_model.train()

In [None]:
# Store the label predicted by scANVI for every cell in obs['C_scANVI'].
adata.obs["C_scANVI"] = scanvi_model.predict(adata = adata)

- Extract latent representation

In [None]:
# Store the scANVI latent embedding of every cell in obsm['X_scANVI'].
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata = adata)

- Visualise corrected dataset

In [None]:
# Build the neighbourhood graph on the scANVI latent space and embed it with UMAP.
sc.pp.neighbors(adata, use_rep = "X_scANVI", n_neighbors = 50, metric = 'minkowski')
sc.tl.umap(adata, min_dist = 0.3, spread = 4, random_state = 1712)

# Colour the embedding by the transferred labels ('C_scANVI') as well as the
# original annotations: 'seed_labels' is 'Unknown' for every study cell, so it
# cannot show the result of the label transfer on its own.
sc.pl.umap(
    adata,
    frameon = False,
    color = ['C_scANVI', 'seed_labels', 'donor', 'cell_states', 'condition', 'group', 'batch'],
    size = 1,
    legend_fontsize = 5,
    ncols = 3,
)