### Notebook for new gene programs identification in Cancer epithelial cells with *expimap*
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**31st May 2023**

### Import packages

In [1]:
!pip install scarches scvi-tools scib scib_metrics scvi_colab faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scarches
  Downloading scArches-0.5.9-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.5/128.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scvi-tools
  Downloading scvi_tools-0.20.3-py3-none-any.whl (330 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.3/330.3 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scib
  Downloading scib-1.1.3-1-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.0/79.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scib_metrics
  Downloading scib_metrics-0.3.3-py3-none-any.whl (35 kB)
Collecting scvi_colab
  Downloading scvi_colab-0.12.0-py3-none-any.whl (4.2 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━

In [4]:
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown
import pandas as pd

INFO:lightning_fabric.utilities.seed:Global seed set to 0
 captum (see https://github.com/pytorch/captum).


In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [5]:
sc.set_figure_params(frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

### Data Upload

In [6]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

input = '/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/input_files/All_cells_5000_HVGs.h5ad'
adata = sc.read_h5ad(input)

Mounted at /gdrive


### Preprocess datasets

In [8]:
adata.obs['dataset'].value_counts()

healthy    156195
cancer      32181
Name: dataset, dtype: int64

In [9]:
Healthy_adata = adata[adata.obs['dataset'] == 'healthy', :]
Cancer_adata = adata[adata.obs['dataset'] == 'cancer', :]

### Create expiMap model and train it on reference dataset

In [10]:
# Create a mask with all ones (assuming all genes are equally important)
Healthy_adata.varm['mask'] = np.ones((Healthy_adata.n_vars, 1))

In [11]:
intr_cvae = sca.models.EXPIMAP(
    adata = Healthy_adata,
    condition_key='Sample_ID',
    hidden_layer_sizes=[256, 256, 256],
    recon_loss='nb',
    mask_key='mask'
)


INITIALIZING NEW NETWORK..............
Encoder Architecture:
	Input Layer in, out and cond: 5000 256 233
	Hidden Layer 1 in/out: 256 256
	Hidden Layer 2 in/out: 256 256
	Mean/Var Layer in/out: 256 1
Decoder Architecture:
	Masked linear layer in, ext_m, ext, cond, out:  1 0 0 233 5000
	with hard mask.
Last Decoder layer: softmax


In [12]:
ALPHA = 0.7

In [13]:
import torch
torch.cuda.is_available()

True

In [14]:
early_stopping_kwargs = {
    "early_stopping_metric": "val_unweighted_loss", # val_unweighted_loss
    "threshold": 0,
    "patience": 50,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}
intr_cvae.train(
    n_epochs=200,
    alpha_epoch_anneal=100,
    alpha=ALPHA,
    alpha_kl=0.5,
    weight_decay=0.,
    early_stopping_kwargs=early_stopping_kwargs,
    use_early_stopping=True,
    monitor_only_val=False,
    seed=2020,
    print_stats=True,
    use_gpu = True,
)

Preparing (156195, 5000)
Instantiating dataset
Init the group lasso proximal operator for the main terms.
Number of deactivated terms: 0
-------------------
 |--------------------| 0.5%  - epoch_loss: 1840.7884237393 - epoch_recon_loss: 1840.7884237393 - epoch_kl_loss: 10.2473825626 - val_loss: 1068.2286346936 - val_recon_loss: 1068.2286346936 - val_kl_loss: 20.1148480275
Number of deactivated terms: 0
-------------------
 |--------------------| 1.0%  - epoch_loss: 1316.2545887340 - epoch_recon_loss: 1316.1368109686 - epoch_kl_loss: 23.5555419627 - val_loss: 1054.2423245790 - val_recon_loss: 1054.1177743380 - val_kl_loss: 24.9105202096
Number of deactivated terms: 0
-------------------
 |--------------------| 1.5%  - epoch_loss: 1289.5180947044 - epoch_recon_loss: 1289.2521563166 - epoch_kl_loss: 26.5936492920 - val_loss: 1049.4211330726 - val_recon_loss: 1049.1540822514 - val_kl_loss: 26.7056393858
Number of deactivated terms: 0
-------------------
 |--------------------| 2.0%  - epoc

In [15]:
MEAN = False

In [16]:
Healthy_adata.obsm['X_cvae'] = intr_cvae.get_latent(mean=MEAN, only_active=True)

In [17]:
sc.pp.neighbors(Healthy_adata, use_rep='X_cvae')

### Initlizling the model for query training

In [18]:
q_intr_cvae = sca.models.EXPIMAP.load_query_data(Cancer_adata, intr_cvae)

View of AnnData object with n_obs × n_vars = 32181 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified Cell States', 'nFeature_RNA', 'dataset_x', 'iCMS', 'msi', 'dataset_y', 'Tumor Stage', 'MSS/MSI', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS'
    var: 'feature_types-cancer', 'genome-cancer', 'n_cells_by_counts-cancer', 'mean_counts-cancer', 'log1p_mean_counts-cancer', 'pct_dropout_by_counts-cancer', 'total_counts-cancer', 'log1p_total_counts-cancer', 'gene_id-Kong-healthy', 'gene_name-Kong-heal

In [19]:
q_intr_cvae.train(n_epochs=200, alpha_epoch_anneal=100, weight_decay=0., alpha_kl=0.1, seed=2020, use_early_stopping=True, print_stats=True)

Preparing (32181, 5000)
Instantiating dataset
 |████████████████████| 100.0%  - val_loss: 1317.5009619141 - val_recon_loss: 1317.0665136719 - val_kl_loss: 4.3444687271
Saving best state of network...
Best State was in Epoch 198


In [20]:
Cancer_adata.obsm['X_cvae'] = q_intr_cvae.get_latent(mean=MEAN, only_active=True)

In [21]:
adata = sc.AnnData.concatenate(Healthy_adata, Cancer_adata, batch_key='batch_join', uns_merge='same')

In [22]:
from google.colab import drive
drive.mount('/content/gdrive')

q_intr_cvae.save('/content/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/cancer_model')

adata.write('/content/gdrive/MyDrive/Colab Notebooks/gut_data/cancer_integration/output_files/Epithelial_healthy_and_Joanito_cancer_integrated_andata.h5ad')

Mounted at /content/gdrive
