## Notebook for Epithelial labels transfer and datasets integration 

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 15th May 2023

### Install packages

In [None]:
!pip install scarches scvi-tools scib scib_metrics scvi_colab faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scarches
  Downloading scArches-0.5.8-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.6/120.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scvi-tools
  Downloading scvi_tools-0.20.3-py3-none-any.whl (330 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.3/330.3 kB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scib
  Downloading scib-1.1.3-1-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.0/79.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scib_metrics
  Downloading scib_metrics-0.3.3-py3-none-any.whl (35 kB)
Collecting scvi_colab
  Downloading scvi_colab-0.12.0-py3-none-any.whl (4.2 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━

### Import packages

In [None]:
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown
import pandas as pd

 captum (see https://github.com/pytorch/captum).
INFO:lightning_fabric.utilities.seed:Global seed set to 0


In [None]:
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# All scanpy figure settings go through ONE call: every call to
# sc.set_figure_params() re-applies its defaults for the arguments that are not
# passed, so the previous three separate calls left only figsize in effect
# (frameon and dpi were reset to their defaults by the later calls).
sc.set_figure_params(dpi=200, frameon=False, figsize=(4, 4))

# Compact tensor printing: 3 decimals, no scientific notation, 7 edge items.
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

### Data upload

In [None]:
from google.colab import drive

# Mount Google Drive; the input files below are read from under this mount point.
GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /gdrive


In [None]:
# Locations of the two epithelial input objects on the mounted Drive.
input_healthy = '/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/input_files/Epithelial_Healthy_anndata_normalized.h5ad'
input_cancer = '/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/input_files/Epithelial_Colon_cancer_atlas_normalized.h5ad'

# Healthy cells are used as the reference, cancer cells as the query further down.
adata_healthy = sc.read_h5ad(input_healthy)
adata_cancer = sc.read_h5ad(input_cancer)

### Extract Highly Variable Genes

In [None]:
# Keep a copy of X in a dedicated layer; the HVG selection below reads it via layer="raw_counts".
# NOTE(review): the input file name ends in "_normalized" — confirm that X really holds
# raw counts at this point, since the layer is named and used as raw counts.
adata_healthy.layers['raw_counts'] = adata_healthy.X.copy()

In [None]:
### HVGs selection
# Calculate HVGs for the healthy (reference) dataset, reading counts from the
# "raw_counts" layer and using the library preparation protocol as batch key.
sc.pp.highly_variable_genes(
    adata_healthy,
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "raw_counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [None]:
# Restrict both datasets to the HVGs of the healthy dataset that are also
# present in the cancer dataset, in one shared gene order.

# Make the cancer gene index string-typed and unique so that selecting by name is unambiguous.
adata_cancer.var.index = adata_cancer.var.index.astype(str)
adata_cancer.var_names_make_unique()

# Identify common genes, preserving the gene order of the healthy dataset.
# (Going through set() would give an order that can change from one Python
# process to the next, which makes the model input layout non-reproducible.)
shared_gene_mask = adata_healthy.var_names.isin(adata_cancer.var_names)
common_genes = adata_healthy.var_names[shared_gene_mask].tolist()

# Filter genes. Indexing both objects with the same list also fixes the same
# gene order in both; .copy() materialises real AnnData objects instead of
# views, so later writes to .varm / .obsm / .obs do not act on a view.
adata_healthy = adata_healthy[:, common_genes].copy()
adata_cancer = adata_cancer[:, common_genes].copy()

### Create expiMap model and train it on reference dataset

In [None]:
# Display the per-cell metadata table of the healthy dataset.
adata_healthy.obs

Unnamed: 0_level_0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,Library_Preparation_Protocol,...,dataset,n_genes_by_counts,total_counts,total_counts_mito,pct_counts_mito,total_counts_ribo,pct_counts_ribo,Cell_ID,_scvi_batch,_scvi_labels
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACGAGGATCAGTGTTG-1-4918STDY7421298,BRC2043_10.2Wk_FTIL_SC-EPCAMP,Epithelial,Gut Cell Atlas,BRC2043,Fetal Healthy,10.2Wk,FTIL,SC-EPCAMP,Male,3',...,reference,1513,4993.0,164.0,3.284599,1969.0,39.435207,ACGAGGATCAGTGTTG-1-4918STDY7421298,0,2
H158108_N1-GTTAAGCAGAGGTAGA,H158108_N1,Epithelial,Kong 2023,158108,Healthy adult,,,,Male,10x 3' v2,...,query,806,2072.0,15.0,0.723938,185.0,8.928572,H158108_N1-GTTAAGCAGAGGTAGA,0,2
H180844_N1-AACTCAGTCAAGATCC,H180844_N1,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,499,872.0,119.0,13.646789,213.0,24.426605,H180844_N1-AACTCAGTCAAGATCC,0,2
H180844_N4-CGCTTCAGTAGGCATG,H180844_N4,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,768,1814.0,243.0,13.395810,39.0,2.149945,H180844_N4-CGCTTCAGTAGGCATG,0,2
GACGCGTTCCTCAACC-1-WTDAtest7844018,A33-CAE-0-SC-45N-1,Epithelial,Gut Cell Atlas,A33 (414C),Healthy adult,20-25,CAE,SC-45N,Male,3',...,reference,743,1516.0,109.0,7.189973,401.0,26.451189,GACGCGTTCCTCAACC-1-WTDAtest7844018,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATCGAGTTCGGCTACG-1-WTDAtest7770719,A30-SCL-6-SC-45N-2,Epithelial,Gut Cell Atlas,A30 (398B),Healthy adult,20-25,SCL,SC-45N,Female,3',...,reference,3216,16628.0,1578.0,9.490017,5333.0,32.072407,ATCGAGTTCGGCTACG-1-WTDAtest7770719,0,2
H180844_N1-ATAGACCTCTAACTGG,H180844_N1,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,379,752.0,79.0,10.505320,207.0,27.526596,H180844_N1-ATAGACCTCTAACTGG,0,2
GGAACTTTCTTTACGT-1-4918STDY7421297,BRC2043_10.2Wk_FPIL_SC-EPCAMP,Epithelial,Gut Cell Atlas,BRC2043,Fetal Healthy,10.2Wk,FPIL,SC-EPCAMP,Male,3',...,reference,1156,3803.0,206.0,5.416776,1975.0,51.932686,GGAACTTTCTTTACGT-1-4918STDY7421297,0,2
H180844_N4-TTTGCGCCATGCGCAC,H180844_N4,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,407,807.0,132.0,16.356878,49.0,6.071871,H180844_N4-TTTGCGCCATGCGCAC,0,2


In [None]:
# Create a mask with all ones (assuming all genes are equally important)
# The mask has shape (n_vars, 1): a single column that covers every gene.
# NOTE(review): with one mask column the model created below reports a
# 1-dimensional latent ("Mean/Var Layer in/out: 256 1") — confirm that a single
# term is really intended.
adata_healthy.varm['mask'] = np.ones((adata_healthy.n_vars, 1))

In [None]:
# Reference expiMap model on the healthy data, conditioned on the sample ID and
# using the mask stored under varm['mask'].
intr_cvae = sca.models.EXPIMAP(
    adata=adata_healthy,
    mask_key='mask',
    condition_key='Sample_ID',
    recon_loss='nb',
    hidden_layer_sizes=[256] * 3,
)


INITIALIZING NEW NETWORK..............
Encoder Architecture:
	Input Layer in, out and cond: 4655 256 296
	Hidden Layer 1 in/out: 256 256
	Hidden Layer 2 in/out: 256 256
	Mean/Var Layer in/out: 256 1
Decoder Architecture:
	Masked linear layer in, ext_m, ext, cond, out:  1 0 0 296 4655
	with hard mask.
Last Decoder layer: softmax


In [None]:
# Value passed as `alpha` to intr_cvae.train() for the reference model.
ALPHA = 0.7

In [None]:
# torch is already imported in the import cell at the top; this import is redundant.
import torch
# Show whether CUDA is available; the following cells call .cuda() on the model.
torch.cuda.is_available()

True

In [None]:
# Move the reference model to the GPU (as the last expression, the module repr is displayed).
intr_cvae.model.cuda()

expiMap(
  (encoder): ExtEncoder(
    (FC): Sequential(
      (L0): MaskedCondLayers(
        (expr_L): Linear(in_features=4655, out_features=256, bias=True)
        (cond_L): Linear(in_features=296, out_features=256, bias=False)
      )
      (N0): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A0): ReLU()
      (D0): Dropout(p=0.05, inplace=False)
      (L1): Linear(in_features=256, out_features=256, bias=True)
      (N1): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A1): ReLU()
      (D1): Dropout(p=0.05, inplace=False)
      (L2): Linear(in_features=256, out_features=256, bias=True)
      (N2): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A2): ReLU()
      (D2): Dropout(p=0.05, inplace=False)
    )
    (mean_encoder): Linear(in_features=256, out_features=1, bias=True)
    (log_var_encoder): Linear(in_features=256, out_features=1, bias=True)
  )
  (decoder): MaskedLinearDecoder(
    (L0): MaskedCondLayers(
      (expr_L): MaskedLinear

In [None]:
# Early-stopping / learning-rate schedule settings handed to the trainer.
early_stopping_kwargs = dict(
    early_stopping_metric="val_unweighted_loss",
    threshold=0,
    patience=50,
    reduce_lr=True,
    lr_patience=13,
    lr_factor=0.1,
)

# Train the reference model on the healthy data.
intr_cvae.train(
    n_epochs=200,
    seed=2020,
    alpha=ALPHA,
    alpha_kl=0.5,
    alpha_epoch_anneal=100,
    weight_decay=0.,
    use_early_stopping=True,
    early_stopping_kwargs=early_stopping_kwargs,
    monitor_only_val=False,
    print_stats=True,
    use_gpu=True,
)

Init the group lasso proximal operator for the main terms.
Number of deactivated terms: 0
-------------------
 |--------------------| 0.5%  - epoch_loss: 827.3853559617 - epoch_recon_loss: 827.3853559617 - epoch_kl_loss: 9.0667736573 - val_loss: 712.1950932945 - val_recon_loss: 712.1950932945 - val_kl_loss: 16.5384059941
Number of deactivated terms: 0
-------------------
 |--------------------| 1.0%  - epoch_loss: 712.4422618564 - epoch_recon_loss: 712.3488878066 - epoch_kl_loss: 18.6747933543 - val_loss: 708.5985717773 - val_recon_loss: 708.4958566805 - val_kl_loss: 20.5425506220
Number of deactivated terms: 0
-------------------
 |--------------------| 1.5%  - epoch_loss: 709.9705946318 - epoch_recon_loss: 709.7636091066 - epoch_kl_loss: 20.6984945972 - val_loss: 708.2130160448 - val_recon_loss: 707.9980491080 - val_kl_loss: 21.4967727894
Number of deactivated terms: 0
-------------------
 |--------------------| 2.0%  - epoch_loss: 702.9562443133 - epoch_recon_loss: 702.6336147471 - 

In [None]:
# Value passed as `mean=` to every get_latent() call in this notebook.
MEAN = False

In [None]:
# Store the reference model's latent representation of the healthy cells
# under obsm['expimap_X_cvae'].
adata_healthy.obsm['expimap_X_cvae'] = intr_cvae.get_latent(mean=MEAN, only_active=True)

### Initializing the model for query training

In [None]:
#adata_cancer.X = adata_cancer.X.todense()

In [None]:
# Create the query model from the cancer data and the trained reference model.
q_intr_cvae = sca.models.EXPIMAP.load_query_data(adata_cancer, intr_cvae)

View of AnnData object with n_obs × n_vars = 113593 × 4655
    obs: 'Sample_ID', 'Donor_ID', 'SpecimenType', 'TissueSource', 'ProcessingMethod', 'PatientTypeID', 'Gender', 'Site', 'Grade', 'TumorStage', 'LymphNodeStatus', 'MMRStatusTumor', 'MMRMLH1Tumor', 'Diagnosis', 'organ__ontology_label', 'Library_Preparation_Protocol', 'ClusterFull', 'ClusterMidway', 'Cell Type', 'Study_name', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_id', 'gene_name', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo'
    uns: 'log1p'

INITIALIZING NEW NETWORK..............
Encoder Architecture:
	Input Layer in, out and cond: 4655 256 425
	Hidden Layer 1 in/out: 256 256
	Hidden Layer 2 in/out: 256 256
	Mean/Var Layer in/out: 256 1
Decoder Architecture:
	Masked linear layer in, ext_m, ext, cond, out:  1 0 0 425 4655
	with hard mask.
Last Decode

In [None]:
# Move the query model to the GPU (as the last expression, the module repr is displayed).
q_intr_cvae.model.cuda()

expiMap(
  (encoder): ExtEncoder(
    (FC): Sequential(
      (L0): MaskedCondLayers(
        (expr_L): Linear(in_features=4655, out_features=256, bias=True)
        (cond_L): Linear(in_features=425, out_features=256, bias=False)
      )
      (N0): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A0): ReLU()
      (L1): Linear(in_features=256, out_features=256, bias=True)
      (N1): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A1): ReLU()
      (L2): Linear(in_features=256, out_features=256, bias=True)
      (N2): LayerNorm((256,), eps=1e-05, elementwise_affine=False)
      (A2): ReLU()
    )
    (mean_encoder): Linear(in_features=256, out_features=1, bias=True)
    (log_var_encoder): Linear(in_features=256, out_features=1, bias=True)
  )
  (decoder): MaskedLinearDecoder(
    (L0): MaskedCondLayers(
      (expr_L): MaskedLinear(in_features=1, out_features=4655, bias=False)
      (cond_L): Linear(in_features=425, out_features=4655, bias=False)
    )
    (

In [None]:
# Train the query model on the cancer data.
q_intr_cvae.train(
    n_epochs=200,
    seed=2020,
    alpha_kl=0.1,
    alpha_epoch_anneal=100,
    weight_decay=0.,
    use_early_stopping=True,
    print_stats=True,
)

In [None]:
# Store the query model's latent representation of the cancer cells
# under obsm['expimap_X_cvae'] (same key as used for the healthy cells).
adata_cancer.obsm['expimap_X_cvae'] = q_intr_cvae.get_latent(mean=MEAN, only_active=True)

### Transfer labels from latent embedding to obs

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Training data: latent embedding and 'Cell States' labels of the healthy (reference) cells.
y_train = adata_healthy.obs['Cell States']
X_train = adata_healthy.obsm['expimap_X_cvae']

# Fit a 3-nearest-neighbour classifier on the reference embedding.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict a label for every cancer (query) cell from its latent embedding.
X_query = adata_cancer.obsm['expimap_X_cvae']
predicted_labels = knn.predict(X_query)

# Keep the predictions next to the other per-cell metadata of the query dataset.
adata_cancer.obs['Predicted Cell States'] = predicted_labels

In [None]:
# Number of cancer cells assigned to each predicted cell state.
adata_cancer.obs['Predicted Cell States'].value_counts()

### Get latent representation of reference + query dataset

In [None]:
# Stack healthy and cancer cells into one object; the 'batch_join' column in obs
# records which of the two inputs each cell came from.
adata = adata_healthy.concatenate(
    adata_cancer,
    batch_key='batch_join',
    uns_merge='same',
)

In [None]:
# Latent representation of the joint (healthy + cancer) object from the query model.
# The second argument of get_latent() is the per-cell condition vector, so it
# must hold values of the model's condition key ('Sample_ID', as set when the
# reference model was created). 'Predicted Cell States' was passed here before,
# which is wrong: those are cell-state labels, not conditions, and the column
# is only filled for the cancer cells.
adata.obsm['expimap_X_cvae'] = q_intr_cvae.get_latent(
    adata.X,
    adata.obs['Sample_ID'],
    mean=MEAN,
    only_active=True,
)

In [None]:
from google.colab import drive
# Mount Drive under /content/gdrive for writing the outputs below.
# NOTE(review): the data-upload cell mounts Drive at '/gdrive' instead — confirm
# that two different mount points are intended.
drive.mount('/content/gdrive')

In [None]:
# Save the trained query model to the Drive output folder.
q_intr_cvae.save('/content/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/cancer_model')

In [None]:
# Write the concatenated healthy + cancer object (including obsm['expimap_X_cvae']) to Drive.
adata.write('/content/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/Epithelial_healthy_and_cancer_integrated_andata.h5ad')

### Read data


In [None]:
from google.colab import drive

# Path of the integrated object written earlier in this notebook, as seen
# through the /gdrive mount point.
input_path = '/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/Epithelial_healthy_and_cancer_integrated_andata.h5ad'

# (Re)mount Google Drive so the path above is readable.
drive.mount('/gdrive', force_remount=True)

In [None]:
# Reload the integrated healthy + cancer object from Drive.
adata = sc.read_h5ad(input_path)

### Run UMAP

In [None]:
# Build the neighbour graph on the expiMap latent space and embed it with UMAP.
# The latent representation is stored under 'expimap_X_cvae' everywhere in this
# notebook; the previously used key 'X_cvae' is never written.
sc.pp.neighbors(adata, use_rep='expimap_X_cvae')
sc.tl.umap(adata)

In [None]:
# UMAP coloured by cell-level annotations, one panel per obs column.
# NOTE(review): 'seed_labels', 'Cell States' and 'Location' are not among the obs
# columns printed for the cancer dataset, and the healthy obs printout is
# truncated — confirm these columns exist in the integrated object.
sc.pl.umap(adata, color=['seed_labels', 'Cell States', 'Study_name', 'Donor_ID', 'Diagnosis', 'Location', 'Gender', 'Library_Preparation_Protocol'], frameon=False, wspace=0.6)

In [None]:
from google.colab import drive
# Mount Drive under /content/gdrive and save the object together with the UMAP results.
# NOTE(review): the object was read via the '/gdrive' mount point — confirm that
# using a second mount point for writing is intended.
drive.mount('/content/gdrive')
adata.write('/content/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/Epithelial_healthy_and_cancer_integrated_andata_with_umap.h5ad')

### scIB metrics calculation

In [None]:
from rich import print
import scib
import scib.metrics

In [None]:
from scvi_colab import install
from scib_metrics.benchmark import Benchmarker

In [None]:
import faiss

from scib_metrics.nearest_neighbors import NeighborsOutput


def faiss_hnsw_nn(X: np.ndarray, k: int):
    """Search the ``k`` nearest neighbours of every row of ``X`` with a faiss HNSW index.

    ``X`` is converted to a contiguous float32 array, added to an
    ``IndexHNSWFlat`` (L2 metric) that is handed to ``index_cpu_to_gpu`` for
    device 0, and then queried against itself.

    See https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
    for index param details.

    Returns a ``NeighborsOutput`` with the neighbour indices and the square
    root of the distances reported by faiss.
    """
    n_links = 32  # the "M" parameter of the HNSW index
    points = np.ascontiguousarray(X, dtype=np.float32)
    gpu_resources = faiss.StandardGpuResources()

    cpu_index = faiss.IndexHNSWFlat(points.shape[1], n_links, faiss.METRIC_L2)
    gpu_index = faiss.index_cpu_to_gpu(gpu_resources, 0, cpu_index)
    gpu_index.add(points)
    squared_distances, neighbor_ids = gpu_index.search(points, k)

    # Drop both index handles before returning.
    del cpu_index, gpu_index

    # faiss reports squared L2 distances, hence the square root.
    return NeighborsOutput(indices=neighbor_ids, distances=np.sqrt(squared_distances))


def faiss_brute_force_nn(X: np.ndarray, k: int):
    """Search the ``k`` nearest neighbours of every row of ``X`` with a flat (exhaustive) faiss L2 index.

    ``X`` is converted to a contiguous float32 array, added to an
    ``IndexFlatL2`` that is moved to GPU device 0, and then queried against
    itself.

    Returns a ``NeighborsOutput`` with the neighbour indices and the square
    root of the distances reported by faiss.
    """
    points = np.ascontiguousarray(X, dtype=np.float32)
    gpu_resources = faiss.StandardGpuResources()

    cpu_index = faiss.IndexFlatL2(points.shape[1])
    gpu_index = faiss.index_cpu_to_gpu(gpu_resources, 0, cpu_index)
    gpu_index.add(points)
    squared_distances, neighbor_ids = gpu_index.search(points, k)

    # Drop both index handles before returning.
    del cpu_index, gpu_index

    # faiss reports squared L2 distances, hence the square root.
    return NeighborsOutput(indices=neighbor_ids, distances=np.sqrt(squared_distances))

In [None]:
# The benchmark compares a PCA embedding with the expiMap latent space.
# No cell of this notebook computes a PCA, so compute one if the loaded object
# does not already carry it; otherwise the 'X_pca' key cannot be resolved.
if "X_pca" not in adata.obsm:
    sc.pp.pca(adata)

# The expiMap latent representation is stored under 'expimap_X_cvae' throughout
# this notebook (the previously used key 'X_cvae' is never written).
# NOTE(review): 'Cell States' is only known to be set on the healthy cells; the
# cancer cells carry 'Predicted Cell States' — confirm which label column the
# metrics should use for the integrated object.
bm = Benchmarker(
    adata,
    batch_key="Sample_ID",
    label_key="Cell States",
    embedding_obsm_keys=["X_pca", "expimap_X_cvae"],
    n_jobs=-1
)
# Neighbours are computed with the exhaustive faiss GPU search defined above.
bm.prepare(neighbor_computer=faiss_brute_force_nn)
bm.benchmark()

In [None]:
# Show the benchmark results table without min-max scaling of the scores.
bm.plot_results_table(min_max_scale=False)