## Notebook for Joanito cancer epithelial and healthy epithelial datasets integration

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 31st May 2023

### Import packages

In [1]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

from scvi.model.utils import mde 

Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [2]:
%matplotlib inline
# Embed fonts as TrueType (Type 42) in exported PDF/PS figures so that text
# stays editable in vector-graphics editors.
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [3]:
# Check whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

False

In [4]:
# Verbosity 3 is scanpy's "hints" level (errors, warnings, info and hints are printed).
sc.settings.verbosity = 3
# Record the package versions of this environment alongside the analysis.
sc.logging.print_versions()
# Default figure settings for the notebook: 180 dpi on screen, 300 dpi SVG when saved,
# reversed magma colour map.
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.4.0
absl                        NA
appnope                     0.1.2
asttokens                   NA
attr                        22.2.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
brotli                      NA
certifi                     2022.12.07
cffi                        1.15.1
charset_normalizer          2.1.1
chex                        0.1.6
colorama                    0.4.6
comm                        0.1.2
contextlib2                 NA
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
docrep                      0.3.2
entrypoints                 0.4
executing                   0.8.3
flax                        0.6.1
fsspec                      2023.3.0
h5py                        3.8.0
hypergeom_uf

In [5]:
# Architecture keyword arguments collected in one place.
# NOTE(review): this dict is not passed to any model constructor in the visible
# notebook — confirm whether it is still needed.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

In [6]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is a whole number.

    Used as a sanity check that ``adata.X`` holds raw counts rather than
    normalised or log-transformed values. Every stored value is inspected:
    comparing only per-gene column sums would accept fractional values that
    happen to add up to an integer (e.g. 0.5 + 0.5).

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X`` (dense array or
        SciPy sparse matrix).

    Returns
    -------
    bool
        True if all entries are integer-valued, False otherwise (including
        when NaN or infinite values are present).
    """
    from scipy.sparse import issparse  # local import: only this helper needs it

    X = adata.X
    if issparse(X):
        # Implicit zeros are whole numbers already, so only stored entries matter.
        if X.format not in ("csr", "csc", "coo"):
            # Other sparse formats do not keep their values in a flat array.
            X = X.tocsr()
        values = X.data
    else:
        values = np.asarray(X)

    # NaN/inf give NaN under mod, which compares unequal to 0 and yields False.
    return bool(np.all(np.mod(values, 1) == 0))

### Load Data

In [7]:
# Load the dataset to integrate (judging by the file name: all cells, 5000 highly variable genes).
# The path variable is called `input_path` so that Python's built-in `input()` is not shadowed.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Datasets integration/Epithelial_healthy_cancer/All_cells_5000_HVGs.h5ad'
adata = sc.read_h5ad(input_path)

In [8]:
# Sanity check before modelling: adata.X is expected to hold raw (integer) counts.
X_is_raw(adata)

True

In [9]:
# Shuffle the cell order. The fixed random_state makes the permutation identical on
# every run instead of depending on the state of the global random number generator.
# Indexing returns a view of the original AnnData object.
adata = adata[adata.obs.sample(frac=1, random_state=0).index]

### Integration with scVI

In [10]:
# Display the AnnData summary (dimensions and available obs/var annotations).
adata

View of AnnData object with n_obs × n_vars = 191909 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified_Cell_States', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'nFeature_RNA', 'sample.origin', 'dataset_x', 'iCMS', 'msi', 'dataset_y', 'Tumor Stage', 'MSS/MSI', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'Sample origin'
    var: 'feature_types-Colorectal cancer', 'genome-Colorectal cancer', 'feature_types-0-0-0-Healthy gut', 'gene_name-1-0-0-Healthy gut', 'gene_id-0-0-Healthy gut', 'GE

In [13]:
# Turn the shuffled view into an independent AnnData object before registering it.
adata = adata.copy()
# Register the data with scVI: counts are read from adata.layers["counts"], labels from
# "Unified_Cell_States"; sample and donor identity plus two QC metrics enter as covariates.
# NOTE(review): the earlier X_is_raw check inspects adata.X, not this layer — confirm that
# layers["counts"] also holds raw counts.
# NOTE(review): no batch_key is passed; batch structure is only modelled through the
# categorical covariates — confirm this is intended.
scvi.model.SCVI.setup_anndata(adata, 
                              layer = "counts", 
                              labels_key = "Unified_Cell_States", 
                              categorical_covariate_keys = ["Sample_ID", "Donor_ID"],
                              continuous_covariate_keys=["n_genes_by_counts", "pct_counts_mito"])

In [14]:
# Build the scVI model: 50 latent dimensions, 3 hidden layers, negative-binomial likelihood.
# NOTE(review): `arches_params` defined earlier is not passed here, and its n_layers (2)
# differs from the value used on this line — confirm which configuration is intended.
# NOTE(review): dispersion='gene-batch' is requested although setup_anndata received no
# batch_key — verify this behaves as expected.
scvi_model = scvi.model.SCVI(adata, n_latent = 50, n_layers = 3, dispersion = 'gene-batch', gene_likelihood = 'nb')

In [15]:
# Train scVI with the default training settings (the recorded log shows 42 epochs, GPU not used).
scvi_model.train()

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 42/42: 100%|██████████| 42/42 [33:34<00:00, 47.09s/it, loss=805, v_num=1]

`Trainer.fit` stopped: `max_epochs=42` reached.


Epoch 42/42: 100%|██████████| 42/42 [33:34<00:00, 47.96s/it, loss=805, v_num=1]


In [16]:
# Store the scVI latent representation of every cell in adata.obsm under "X_scVI".
adata.obsm["X_scVI"] = scvi_model.get_latent_representation()

### Integration with scANVI

In [17]:
# Initialise scANVI from the trained scVI model, using the same AnnData object and label column.
# Cells whose label equals "Unknown" are treated as unlabelled.
# NOTE(review): confirm that "Unknown" is the value actually used for unannotated cells
# in "Unified_Cell_States".
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    adata=adata,
    labels_key="Unified_Cell_States",
    unlabeled_category="Unknown",
)

In [18]:
# Train scANVI with the default training settings (the recorded log shows 10 epochs).
scanvi_model.train()

[34mINFO    [0m Training for [1;36m10[0m epochs.                                                                                   


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 10/10: 100%|██████████| 10/10 [12:01<00:00, 72.59s/it, loss=881, v_num=1]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 10/10: 100%|██████████| 10/10 [12:01<00:00, 72.20s/it, loss=881, v_num=1]


In [19]:
# Store the scANVI latent representation in adata.obsm under "X_scANVI";
# the neighbour graph further down is computed on this embedding.
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

In [26]:
# Checkpoint: write the AnnData object, including the X_scVI and X_scANVI embeddings, to disk
# so the downstream steps can start from this file without retraining.
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Datasets integration/Epithelial_healthy_cancer/Integrated_epithelial_healthy_and_cancer.h5ad')

In [7]:
# Reload the integrated object written by the save step so the steps below can run
# without retraining. The path variable is called `integrated_path` so that Python's
# built-in `input()` is not shadowed.
integrated_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Datasets integration/Epithelial_healthy_cancer/Integrated_epithelial_healthy_and_cancer.h5ad'
adata = sc.read_h5ad(integrated_path)

In [14]:
def replace_location_nan(adata):
    """Fill placeholder ``'nan'`` locations using the prefix of ``Sample_ID``.

    For every cell whose ``obs['Location']`` equals the string ``'nan'``, the
    location is replaced by the part of ``obs['Sample_ID']`` that precedes the
    first ``'-'``.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame with ``'Location'`` and
        ``'Sample_ID'`` columns.

    Returns
    -------
    The same ``adata`` object. When at least one location was filled,
    ``adata.obs`` is replaced by the updated copy; otherwise it is untouched.
    """
    # Work on a copy so that a failure part-way through never leaves obs half-edited.
    obs = adata.obs.copy()

    # Rows where Location is the literal string 'nan' (not a real missing value).
    is_nan_loc = obs['Location'] == 'nan'
    if not is_nan_loc.any():
        # Nothing to fill; returning early also keeps the cell safe to re-run.
        return adata

    # Part of the Sample_ID before the first '-'.
    new_locs = obs.loc[is_nan_loc, 'Sample_ID'].str.split('-', n=1).str[0]

    location = obs['Location']
    if isinstance(location.dtype, pd.CategoricalDtype):
        # A categorical column rejects values outside its categories, so register the
        # new ones first. add_categories returns a new Series and must be assigned back;
        # its `inplace` option no longer exists in pandas 2.0.
        known = set(location.cat.categories)
        missing = [loc for loc in new_locs.dropna().unique() if loc not in known]
        if missing:
            obs['Location'] = location.cat.add_categories(missing)

    # Assignment aligns on the obs index, so each cell receives its own prefix.
    obs.loc[is_nan_loc, 'Location'] = new_locs

    adata.obs = obs
    return adata

# Fill the placeholder 'nan' locations in adata.obs from the Sample_ID prefix.
adata = replace_location_nan(adata)


In [16]:
def simplify_locations(adata):
    """Collapse detailed locations in ``adata.obs['Location']`` into broader groups.

    Labels listed in the mapping are replaced by their broader category. Labels
    without an entry (including missing values) are kept unchanged instead of
    being turned into NaN, so no annotation is silently lost and running the
    function a second time leaves the column as it is.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame with a ``'Location'`` column.

    Returns
    -------
    The same ``adata`` object, with ``obs['Location']`` rewritten in place as a
    categorical column.
    """
    # Detailed location -> broader category.
    location_map = {
        'Large Intestine': 'Intestine',
        'Terminal Ileum': 'Ileum',
        'Rectum': 'Rectum',
        'Small Intestine': 'Intestine',
        'Sigmoid colon': 'Colon',
        'Ascending colon': 'Colon',
        'Caecum': 'Caecum',
        'APD': 'APD',
        'Hepatic Flexure': 'Colon',
        'Epithelium': 'Epithelium',
        'Rectosigmoid': 'Rectum',
        'Low rectum': 'Rectum',
        'Upper rectum': 'Rectum',
        'Lamina Propria': 'Other',
        'Distal Ascending colon': 'Colon',
        'Mid-rectum': 'Rectum',
        'Transverse colon': 'Colon',
        'Descending colon': 'Colon',
        'Ileum': 'Ileum',
        'Distal Sigmoid colon': 'Colon',
        'Colon': 'Colon',
        'Distal Descending colon': 'Colon'
    }

    # Map on plain objects: the mapping is many-to-one, and dict.get(value, value)
    # passes through any label that has no entry in the map.
    detailed = adata.obs['Location'].astype(object)
    simplified = detailed.map(lambda loc: location_map.get(loc, loc))

    # Store the result as a categorical column again.
    adata.obs['Location'] = simplified.astype('category')

    return adata

# Collapse the detailed locations in adata.obs into broader anatomical groups.
adata = simplify_locations(adata)


In [19]:
# Build the neighbour graph on the scANVI embedding stored in adata.obsm["X_scANVI"].
sc.pp.neighbors(adata, use_rep='X_scANVI')

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:16)


In [20]:
# Compute the UMAP layout from the neighbour graph; min_dist and spread control how
# tightly points are packed, and the fixed random_state keeps the layout reproducible.
sc.tl.umap(adata, min_dist = 0.4, spread = 4, random_state = 1712)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:36)


In [None]:
# Raise the on-screen resolution for the overview figure.
# NOTE(review): set_figure_params appears to reset every argument not passed here to its
# default, so the dpi_save / format / color_map chosen at the top of the notebook would no
# longer apply after this call — confirm this is intended.
sc.set_figure_params(dpi=300)
# UMAP overview coloured by study, cell state, clinical/technical metadata and QC metrics,
# laid out three panels per row.
sc.pl.umap(adata, color=["Study_name", 'Unified_Cell_States', 'Diagnosis','Location', 'Library_Preparation_Protocol', 'Sex', 'n_genes_by_counts', 'total_counts', 'pct_counts_mito', 'doublet_info'], color_map = "magma", size = 2.5, frameon = False, legend_fontsize = 9, ncols = 3)