### Notebook for the label transfer from Healthy-CTRL cells to other samples with `scNym`.

- **Developed by**: Carlos Talavera-López
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221012

### Load required modules

In [None]:
import time
import scnym
import anndata
import scipy as sp
import numpy as np
import pandas as pd
import scanpy as sc

### Set up working environment

In [None]:
# Verbose scanpy logging, plus a record of the package versions used in this run.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults applied to every scanpy plot produced below.
sc.settings.set_figure_params(
    dpi = 160,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

In [None]:
# Look up the scNym preset that is also passed by name to the train/predict calls below.
config_name = "new_identity_discovery"
config = scnym.api.CONFIGS[config_name]
# NOTE(review): this assigns into the object returned by `CONFIGS[config_name]`;
# presumably that is the shared preset dict, so later lookups by name would see
# this change too — confirm against the scNym API.
config["domain_groupby"] = "domain_label"

### Read in annotated healthy reference object

In [None]:
# Load the scNym-annotated healthy dataset; its `obs['scNym']` column supplies the reference labels.
healthy_raw = sc.read_h5ad('../data/BrEpit_Healthy_scnym_annotated_ctl221101.h5ad')
# Bare name as last expression: displays the object summary in the notebook.
healthy_raw

In [None]:
# Keep only cells whose scNym label is NOT one of the excluded states.
# Subsetting an AnnData with a boolean mask returns a view of `healthy_raw`;
# `.copy()` turns it into an independent object so that the `obs` column
# assignments in the next cell write to real data instead of forcing an
# implicit copy of the view (with the accompanying warning).
reference = healthy_raw[~healthy_raw.obs['scNym'].isin(['Duct', 'Squamous', 'Transit epi 1',
       'Transit epi 2'])].copy()
# Bare name as last expression: displays the object summary in the notebook.
reference

### Format data as reference for `scNym`

In [None]:
# Tag every reference cell with its sample group and carry the existing scNym
# annotation over as the label column used for training.
reference.obs['group'] = 'healthy_ctrl'
reference.obs['cell_states'] = reference.obs['scNym'].copy()

# Domain label = 'train_' + batch identifier, stored as a categorical column.
reference.obs['domain_label'] = (
    'train_' + reference.obs['batch'].astype(str)
).astype('category')

# Show the resulting domain categories.
reference.obs['domain_label'].cat.categories

### Read in queries

In [None]:
# Load the healthy IAV query dataset.
healthy_iav = sc.read_h5ad('../data/BrEpit_Healthy_IAV_ctl221011.raw.h5ad')
# Query cells carry no annotation: fill the label column (later passed as `groupby`) with 'Unlabeled'.
healthy_iav.obs['cell_states'] = 'Unlabeled'
# Bare name as last expression: displays the object summary in the notebook.
healthy_iav

In [None]:
# Load the COPD control query dataset.
copd_ctrl = sc.read_h5ad('../data/BrEpit_COPD_CTRL_ctl221011.raw.h5ad')
# Query cells carry no annotation: fill the label column (later passed as `groupby`) with 'Unlabeled'.
copd_ctrl.obs['cell_states'] = 'Unlabeled'
# Bare name as last expression: displays the object summary in the notebook.
copd_ctrl

In [None]:
# Load the COPD IAV query dataset.
copd_iav = sc.read_h5ad('../data/BrEpit_COPD_IAV_ctl221011.raw.h5ad')
# Query cells carry no annotation: fill the label column (later passed as `groupby`) with 'Unlabeled'.
copd_iav.obs['cell_states'] = 'Unlabeled'
# Bare name as last expression: displays the object summary in the notebook.
copd_iav

In [None]:
# Stack the three query datasets into one object. The new `group` column records
# which dataset each cell came from, using the names in `batch_categories`
# (same order as the inputs: healthy_iav, copd_ctrl, copd_iav).
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases in
# favour of `anndata.concat`; the two differ in how `var` columns are merged, so
# verify before migrating.
query = healthy_iav.concatenate(copd_ctrl, copd_iav, batch_key = 'group', batch_categories = ['healthy_iav', 'copd_ctrl', 'copd_iav'], join = 'inner')
# Bare name as last expression: displays the object summary in the notebook.
query

### Format data as query for `scNym`

In [None]:
# Domain label = 'target_' + batch identifier, stored as a categorical column.
query.obs['domain_label'] = (
    'target_' + query.obs['batch'].astype(str)
).astype('category')

# Show the resulting domain categories.
query.obs['domain_label'].cat.categories

### Merge query+reference datasets 

In [None]:
# Join query and reference cells into a single object; the new `object` column
# marks each cell as 'query' or 'reference' (order matches the inputs).
combined_object = query.concatenate(reference, batch_key = 'object', batch_categories = ['query', 'reference'], join = 'inner')
# Bare name as last expression: displays the object summary in the notebook.
combined_object

### Normalise data for scNym

In [None]:
 # Keep an untouched copy before normalisation; its `.X` is reused when exporting.
 combined_object_raw = combined_object.copy()

 # Scale each cell to a total of 1e6 and log1p-transform, in place.
 # NOTE(review): with `exclude_highly_expressed = True` the size factor ignores
 # highly expressed genes, so per-cell totals may not equal `target_sum` —
 # confirm this is compatible with the log1p(CPM) input scNym expects.
 sc.pp.normalize_total(combined_object, target_sum = 1e6, exclude_highly_expressed = True)
 sc.pp.log1p(combined_object)

### Train reference with `scNym`

- Record start time for `scNym` training

In [None]:
# Wall-clock timestamp taken just before training; subtracted from `end_time` further down.
start_time = time.time()

- Train model

In [None]:
# Run scNym in 'train' mode on the combined query+reference object.
# `cell_states` holds the labels ('Unlabeled' for query cells) and `domain_label`
# holds the 'train_*' / 'target_*' domain of each cell.
# NOTE(review): `out_path` is a machine-specific absolute path; the predict cell
# reads the trained model from the same directory.
scnym.api.scnym_api(
    adata = combined_object,
    task = 'train',
    groupby = 'cell_states',
    domain_groupby = 'domain_label',
    out_path = '/home/cartalop/data/single_cell/lung/influenza/scnym_models/all_data/',
    config = 'new_identity_discovery',
)

- Record end time for `scNym` training

In [None]:
# Wall-clock timestamp taken right after the training cell has finished.
end_time = time.time()

- Compute the elapsed time

In [None]:
# Elapsed wall-clock time between the two timestamps, in seconds (difference of `time.time()` values).
total_time = end_time - start_time
print(f"Total time: {total_time}")

### Predict cell labels

In [None]:
# Direct import of the API entry point (the training cell calls it as `scnym.api.scnym_api`).
from scnym.api import scnym_api

# Run scNym in 'predict' mode with the model saved by the training cell.
# The `scNym` and `scNym_confidence` obs columns and the `X_scnym` embedding
# used in the cells below are expected to be added by this call.
scnym_api(
    adata = combined_object,
    task = 'predict',
    key_added = 'scNym',
    trained_model = '/home/cartalop/data/single_cell/lung/influenza/scnym_models/all_data/',
    out_path = '/home/cartalop/data/single_cell/lung/influenza/scnym_models/all_data/',
    config = 'new_identity_discovery',
)


### Visualise label transfer and confidence using `X_scnym`

In [None]:
# Neighbour graph computed on the `X_scnym` representation rather than on `.X`.
sc.pp.neighbors(
    combined_object,
    n_neighbors = 50,
    use_rep = 'X_scnym',
)

# UMAP layout with a fixed seed so the embedding is reproducible.
sc.tl.umap(
    combined_object,
    min_dist = 0.2,
    spread = 5,
    random_state = 1712,
)

# Predicted labels and confidence next to sample group and input labels.
sc.pl.umap(
    combined_object,
    color = ['scNym', 'scNym_confidence', 'group', 'cell_states'],
    size = 0.3,
    frameon = False,
    legend_loc = 'on data',
    legend_fontsize = 4,
)

In [None]:
# Metadata overview on the same embedding, laid out three panels per row.
sc.pl.umap(
    combined_object,
    color = ['group', 'object', 'disease', 'infection', 'scNym_confidence', 'scNym'],
    size = 0.3,
    frameon = False,
    legend_fontsize = 5,
    ncols = 3,
)

In [None]:
# Colour by `object`, restricting the highlighted group to 'query'.
sc.pl.umap(
    combined_object,
    color = ['object'],
    groups = ['query'],
    size = 0.8,
    frameon = False,
    legend_fontsize = 5,
    ncols = 3,
)

In [None]:
# Display the object summary to inspect the fields added by prediction and embedding.
combined_object

### Save object

In [None]:
# Assemble the object to save: the expression matrix comes from the
# pre-normalisation copy, every annotation (obs, var, uns, obsm, obsp) from the
# processed object that holds the scNym results and embeddings.
adata_export = anndata.AnnData(
    X = combined_object_raw.X,
    obs = combined_object.obs,
    var = combined_object.var,
    uns = combined_object.uns,
    obsm = combined_object.obsm,
    obsp = combined_object.obsp,
)
# Bare name as last expression: displays the object summary in the notebook.
adata_export

### Subset query cells only 