### Notebook for the integration benchmark with the `scIB` package for the Integrated Gut Dataset

- **Developed by:** Anna Maguza
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- 26th April 2022

In [1]:
import anndata as an
import scanpy as sc
from rich import print
import scib
import scib.metrics

In [2]:
from scvi_colab import install
from scib_metrics.benchmark import Benchmarker

In [3]:
# Global scanpy configuration for this notebook: logging verbosity,
# a printed record of the installed package versions, and figure defaults.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults: on-screen at 180 dpi, saved at 300 dpi as SVG,
# reversed 'magma' colour map, vector-friendly rendering enabled.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    vector_friendly=True,
    format='svg',
)

-----
anndata     0.9.1
scanpy      1.9.3
-----
PIL                         9.5.0
absl                        NA
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
chex                        0.1.7
comm                        0.1.2
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
deprecated                  1.2.13
executing                   0.8.3
h5py                        3.8.0
igraph                      0.10.4
importlib_resources         NA
ipykernel                   6.19.2
jax                         0.4.8
jaxlib                      0.4.7
jedi                        0.18.1
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.39.1
matplotlib                  3.7.1
ml_dtypes                   0.1.0
mpl_toolkits                NA
natsort  

  IPython.display.set_matplotlib_formats(*ipython_format)


### 3000 Highly Variable Genes

In [4]:
# Location of the integrated 3000-HVG object to benchmark.
# NOTE(review): `input` shadows the Python builtin of the same name, and both
# paths are absolute, machine-specific locations — consider a configurable
# data directory. Names and values are kept unchanged here.
input = (
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/'
    'Datasets Integration/Integrated datasets/'
    'All_cells_3000_HVGs_scvi_scanvi.h5ad'
)

# Destination path for the object with metrics; it is only defined in this
# cell, not written to.
output = (
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/'
    'Datasets Integration/Integrated datasets/'
    'All_cells_3000_HVGs_scvi_scanvi_metrics.h5ad'
)

# Read the integrated AnnData object from disk.
adata = sc.read_h5ad(input)

In [5]:
# Display the AnnData summary: dimensions plus the available obs / var / uns / obsm / layers keys.
adata


AnnData object with n_obs × n_vars = 557099 × 3000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'hvg'
    obsm: 'X_mde', 'X_mde_scanvi', 'X_scANVI', 'X_scVI', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    layers: 'counts'

In [6]:
# Set up the scib-metrics benchmark over three embeddings, using the sample ID
# as the batch variable and the cell type annotation as the biological label.
# NOTE(review): 'X_pca' is not among the obsm keys of the loaded object; the
# run log shows a PCA (n_comps=50) being computed while this cell executes —
# presumably by the Benchmarker itself; confirm.
bm = Benchmarker(
    adata,
    embedding_obsm_keys=[
        "X_pca",
        "X_scANVI",
        "X_scVI",
    ],
    label_key="Cell Type",
    batch_key="Sample_ID",
    n_jobs=-1,  # passed through unchanged; presumably "use all available cores" — confirm
)

# Run the benchmark; results are kept on `bm` for the plotting cell below.
bm.benchmark()

computing PCA
    with n_comps=50
    finished (0:00:37)


  from .autonotebook import tqdm as notebook_tqdm
Computing neighbors: 100%|██████████| 3/3 [23:09<00:00, 463.09s/it]
Embeddings:   0%|[32m          [0m| 0/3 [00:00<?, ?it/s]

: 

: 

In [None]:
# Draw the benchmark results table from the Benchmarker object.
# min_max_scale=False — presumably leaves the metric scores unscaled; confirm against the scib-metrics docs.
bm.plot_results_table(min_max_scale=False)