## Notebook for Smillie data integration and batch correction `scVI`

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 4th July 2023

### Load required modules

In [2]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [3]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [4]:
torch.cuda.is_available()

False

In [5]:
torch.set_float32_matmul_precision('medium')

In [6]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.4.0
absl                        NA
appnope                     0.1.2
asttokens                   NA
attr                        22.2.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
brotli                      NA
certifi                     2022.12.07
cffi                        1.15.1
charset_normalizer          2.1.1
chex                        0.1.6
colorama                    0.4.6
comm                        0.1.2
contextlib2                 NA
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
docrep                      0.3.2
entrypoints                 0.4
executing                   0.8.3
flax                        0.6.1
fsspec                      2023.3.0
h5py                        3.8.0
hypergeom_uf

In [7]:
arches_params = dict(
    use_layer_norm = "both",
    use_batch_norm = "none",
    encode_covariates = True,
    dropout_rate = 0.2,
    n_layers = 2,
)

In [8]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

### Read in datasets

In [8]:
input = '/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_all_cells.h5ad'
adata = sc.read(input)

In [9]:
# Filter only tumor cells
adata = adata[adata.obs['sample.origin'] == 'Tumor']

In [10]:
X_is_raw(adata)

True

In [11]:
# Save raw data
adata.raw = adata

In [12]:
adata.layers['counts'] = adata.X.copy()

# Calculate 5000 HVGs
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "counts",
    batch_key = 'Library_Preparation_Protocol',
    subset = True,
    span = 1
)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


In [13]:
adata.obs

Unnamed: 0,nFeature_RNA,pct_counts_mito,Sample_ID,Donor_ID,sample.origin,dataset_x,Cell_Type,dataset_y,Sex,Tumor Stage,...,iCMS,msi,batch,Age_group,Study_name,Diagnosis,n_genes_by_counts,total_counts,Library_Preparation_Protocol,total_counts_mito
CRC16_MUX8563_AAACCTGCAAGCCGCT-1-0,2577,0.0,MUX8563,CRC2794,Tumor,CRC-SG1,Plasma cells,CRC-SG1,Female,65.0,...,,,0,Adult,"Joanito, 2022","Colorectal cancer, Stage III",2577,29514.0,10x 5' v1,0.0
CRC16_MUX8563_AAACCTGTCTCGATGA-1-0,1531,0.0,MUX8563,CRC2794,Tumor,CRC-SG1,T cells,CRC-SG1,Female,65.0,...,,,0,Adult,"Joanito, 2022","Colorectal cancer, Stage III",1531,3259.0,10x 5' v1,0.0
CRC16_MUX8563_AAACCTGTCTCTGTCG-1-0,3083,0.0,MUX8563,CRC2794,Tumor,CRC-SG1,Plasma cells,CRC-SG1,Female,65.0,...,,,0,Adult,"Joanito, 2022","Colorectal cancer, Stage III",3083,71212.0,10x 5' v1,0.0
CRC16_MUX8563_AAACGGGCAAGTTAAG-1-0,1686,0.0,MUX8563,CRC2794,Tumor,CRC-SG1,T cells,CRC-SG1,Female,65.0,...,,,0,Adult,"Joanito, 2022","Colorectal cancer, Stage III",1686,3771.0,10x 5' v1,0.0
CRC16_MUX8563_AAACGGGCAGTTCATG-1-0,4304,0.0,MUX8563,CRC2794,Tumor,CRC-SG1,Mesenchymal,CRC-SG1,Female,65.0,...,,,0,Adult,"Joanito, 2022","Colorectal cancer, Stage III",4304,16948.0,10x 5' v1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KUL5_EXT127_GACTGCGAGTAGCGGT-1-1,5368,0.0,EXT127,SC044,Tumor,KUL5,Epithelial,KUL5,Female,80.0,...,iCMS3,MSI-H,1,Adult,"Joanito, 2022","Colorectal cancer, Stage III",5368,20711.0,10x 5' v1,0.0
KUL5_EXT127_GTGCATAGTTTGACAC-1-1,4430,0.0,EXT127,SC044,Tumor,KUL5,Epithelial,KUL5,Female,80.0,...,iCMS3,MSI-H,1,Adult,"Joanito, 2022","Colorectal cancer, Stage III",4430,11799.0,10x 5' v1,0.0
KUL5_EXT127_TATCAGGGTGTGAAAT-1-1,2291,0.0,EXT127,SC044,Tumor,KUL5,Epithelial,KUL5,Female,80.0,...,iCMS3,MSI-H,1,Adult,"Joanito, 2022","Colorectal cancer, Stage III",2291,5210.0,10x 5' v1,0.0
KUL5_EXT127_TCACAAGAGATCCCGC-1-1,4633,0.0,EXT127,SC044,Tumor,KUL5,Epithelial,KUL5,Female,80.0,...,iCMS3,MSI-H,1,Adult,"Joanito, 2022","Colorectal cancer, Stage III",4633,13969.0,10x 5' v1,0.0


In [14]:
adata.obs.rename(columns = {'Cell Type': 'Cell_Type'}, inplace = True)

### Run Integration with scVI

In [15]:
adata = adata.copy()
scvi.model.SCVI.setup_anndata(adata, 
                              layer = "counts", 
                              labels_key = "Cell_Type", 
                              categorical_covariate_keys = ["Sample_ID"])

In [16]:
scvi_model = scvi.model.SCVI(adata, n_latent = 50, n_layers = 3, dispersion = 'gene-batch', gene_likelihood = 'nb')

In [17]:
scvi_model.train()

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 47/47: 100%|██████████| 47/47 [31:43<00:00, 40.54s/it, loss=1.04e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=47` reached.


Epoch 47/47: 100%|██████████| 47/47 [31:43<00:00, 40.49s/it, loss=1.04e+03, v_num=1]


In [18]:
adata.obsm["X_scVI"] = scvi_model.get_latent_representation()

### Integration with scANVI

In [19]:
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    adata=adata,
    labels_key="Cell_Type",
    unlabeled_category="Unknown",
)

In [20]:
scanvi_model.train()

[34mINFO    [0m Training for [1;36m10[0m epochs.                                                                                   


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 10/10: 100%|██████████| 10/10 [10:47<00:00, 64.92s/it, loss=1.14e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 10/10: 100%|██████████| 10/10 [10:47<00:00, 64.79s/it, loss=1.14e+03, v_num=1]


In [21]:
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

In [22]:
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Joanito_scVI_scANVI.h5ad')

### UMAP calculation

In [23]:
sc.pp.neighbors(adata, use_rep = "X_scANVI", n_neighbors = 50, metric = 'minkowski')

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:36)


In [24]:
sc.tl.umap(adata, min_dist = 0.4, spread = 4, random_state = 1712)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:50)


In [51]:
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Joanito_scVI_scANVI.h5ad')

In [26]:
adata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 170596 × 5000
    obs: 'nFeature_RNA', 'pct_counts_mito', 'Sample_ID', 'Donor_ID', 'sample.origin', 'dataset_x', 'Cell_Type', 'dataset_y', 'Sex', 'Tumor Stage', 'MSS/MSI', 'Location', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'cell_ID', 'iCMS', 'msi', 'batch', 'Age_group', 'Study_name', 'Diagnosis', 'n_genes_by_counts', 'total_counts', 'Library_Preparation_Protocol', 'total_counts_mito', '_scvi_batch', '_scvi_labels'
    var: 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mito', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', '_scvi_uuid', '_scvi_manager_uuid', 'neighbors', 'umap'
    obsm: '_scvi_extra_categorical_covs', 'X_scVI', 'X_scANVI', 'X_umap'
    layers: 'count

In [None]:
adata.obs

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['Cell_Type', 'Diagnosis', 'Donor_ID', 'Location', 'Sex', 'Library_Preparation_Protocol'], size = 1, legend_fontsize = 5, ncols = 3)

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['MSS/MSI', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA'], size = 1, legend_fontsize = 5, ncols = 3)

In [47]:
adata.obs['predicted_doublets'] = adata.obs['predicted_doublets'].astype(str)

In [4]:
adata_with_mito = sc.read('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad')

In [5]:
# Merge adata_with_mito.obs['percent.mt'] with adata.obs by index
adata.obs = adata.obs.merge(adata_with_mito.obs['percent.mt'], left_index = True, right_index = True)

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['n_genes_by_counts', 'total_counts', 'percent.mt', 'predicted_doublets'], size = 1, legend_fontsize = 5, ncols = 4)

In [3]:
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Joanito_scVI_scANVI.h5ad'
adata = sc.read(input)

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, color=['n_genes_by_counts', 'total_counts', 'percent.mt', 'predicted_doublets'],
             color_map = "RdPu", size = 1, frameon = False, ncols=5)

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, color=['Cell_Type'],
             color_map = "RdPu", size = 1, frameon = False, ncols=5)