## Notebook for Epithelial labels transfer and datasets integration 

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 15th May 2023

### Import packages

In [1]:
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown

 captum (see https://github.com/pytorch/captum).
INFO:lightning_fabric.utilities.seed:Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
sc.set_figure_params(frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

### Data Upload

In [4]:
input_cancer = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_Colon_cancer_atlas_anndata.h5ad'
cancer_andata = sc.read_h5ad(input_cancer)
input_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_Healthy_anndata.h5ad'
healthy_andata = sc.read_h5ad(input_healthy)

### Create expiMap model and train it on reference dataset

In [5]:
healthy_andata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 210075 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Age_group_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager_uuid', '_scvi_uuid', 

In [39]:
df= healthy_andata.obs['Cell States Kong'].value_counts()

In [40]:
# Create a mask with all ones (assuming all genes are equally important)
healthy_andata.varm['mask'] = np.ones((healthy_andata.n_vars, 1))

In [41]:
healthy_andata

AnnData object with n_obs × n_vars = 210075 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Age_group_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'log1p', 'neighbors', 'umap

In [42]:
intr_cvae = sca.models.EXPIMAP(
    adata = healthy_andata,
    condition_key='Cell States',
    hidden_layer_sizes=[256, 256, 256],
    recon_loss='nb',
    mask_key='mask'
)


INITIALIZING NEW NETWORK..............
Encoder Architecture:
	Input Layer in, out and cond: 5000 256 45
	Hidden Layer 1 in/out: 256 256
	Hidden Layer 2 in/out: 256 256
	Mean/Var Layer in/out: 256 1
Decoder Architecture:
	Masked linear layer in, ext_m, ext, cond, out:  1 0 0 45 5000
	with hard mask.
Last Decoder layer: softmax


In [43]:
ALPHA = 0.7

In [44]:
early_stopping_kwargs = {
    "early_stopping_metric": "val_unweighted_loss", # val_unweighted_loss
    "threshold": 0,
    "patience": 50,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}
intr_cvae.train(
    n_epochs=400,
    alpha_epoch_anneal=100,
    alpha=ALPHA,
    alpha_kl=0.5,
    weight_decay=0.,
    early_stopping_kwargs=early_stopping_kwargs,
    use_early_stopping=True,
    monitor_only_val=False,
    seed=2020,
)

Init the group lasso proximal operator for the main terms.
 |████████████████████| 100.0%  - epoch_loss: 752.4581540821 - epoch_recon_loss: 750.0579779887 - epoch_kl_loss: 4.8003519578 - val_loss: 712.0512367807 - val_recon_loss: 709.6358690960 - val_kl_loss: 4.830735436327
Saving best state of network...
Best State was in Epoch 398


In [45]:
MEAN = False

In [46]:
healthy_andata.obsm['X_cvae'] = intr_cvae.get_latent(mean=MEAN, only_active=True)

### Initlizling the model for query training

In [59]:
cancer_andata.obs['Cell States'] = 'Unlabeled'

In [54]:
# Make index as a column in cancer_andata.var
cancer_andata.var['gene_id'] = cancer_andata.var.index

In [61]:
cancer_andata.X

<113593x43282 sparse matrix of type '<class 'numpy.float32'>'
	with 403347148 stored elements in Compressed Sparse Column format>

In [62]:
healthy_andata.X

<210075x5000 sparse matrix of type '<class 'numpy.float32'>'
	with 76008049 stored elements in Compressed Sparse Row format>

In [6]:
# Make 'gene_name' as index in cancer_andata.var
cancer_andata.var.set_index('gene_name', inplace=True)

In [None]:
# Make indexes as string in cancer_andata.var
cancer_andata.var.index = cancer_andata.var.index.astype(str)

In [63]:
cancer_andata.X = cancer_andata.X.todense()

In [None]:
# Check if there are duplicate gene names in your query dataset
if not cancer_andata.var_names.is_unique:
    cancer_andata.var_names_make_unique()

In [64]:
q_intr_cvae = sca.models.EXPIMAP.load_query_data(cancer_andata, intr_cvae)



Query data is missing expression data of  312  genes which were contained in the reference dataset.
The missing information will be filled with zeroes.


: 

: 

In [None]:
q_intr_cvae.train(n_epochs=400, alpha_epoch_anneal=100, weight_decay=0., alpha_kl=0.1, seed=2020, use_early_stopping=True)