### Notebook for label transfer with `scNym` from the Kong_2023 reference to the predicted GCA stem cells

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 21st April 2023

#### Load required packages

In [1]:
import scnym

tensorflow is not installed, assuming tensorboard is independent


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import torch

#### Setup Cells


In [6]:
%matplotlib inline

In [3]:
import os
import warnings

# Move the working directory one level up; relative paths used later in the
# notebook are resolved from there.
os.chdir('../')

# Silence FutureWarning and UserWarning messages for the rest of the session.
for noisy_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=noisy_category)

In [4]:
# Configure plotting in a single call: `set_figure_params` re-applies its own
# defaults for every argument that is not passed, so spreading the settings over
# several calls silently resets `dpi` and `frameon` on the later calls.
sc.set_figure_params(dpi=200, frameon=False, figsize=(4, 4))
# Compact tensor printing: 3 decimals, no scientific notation, 7 edge items.
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

In [5]:
def X_is_raw(adata):
    """Return True when ``adata.X`` holds only whole-number values (raw counts).

    Every stored entry is inspected rather than the per-column sums, because
    fractional values can add up to an integer total (e.g. two cells with 0.5
    each) and would then be mistaken for raw counts.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X``; either a dense
        array-like or a sparse matrix that provides ``tocsr()``.

    Returns
    -------
    bool
        True if all values are integers, False otherwise. NaN values count as
        non-integer.
    """
    matrix = adata.X
    if hasattr(matrix, "tocsr"):
        # Sparse input: implicit zeros are integers, so only the explicitly
        # stored values need to be checked.
        values = matrix.tocsr().data
    else:
        values = np.asarray(matrix)
    # A value is whole exactly when its remainder modulo 1 is zero.
    return bool(np.all(np.mod(values, 1) == 0))

#### Upload Data

In [7]:
# Reference dataset (Kong 2023): destination path for the processed object,
# source path of the raw AnnData file, and the loaded object itself.
output_Kong = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Processed_anndata/Kong_2023_raw_anndata_output.h5ad'
input_Kong = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/Kong_2023_raw_anndata.h5ad'
reference = sc.read(filename=input_Kong)

In [8]:
# Query dataset (predicted stem cells): destination path for the processed
# object, source path of the input file, and the loaded object itself.
output_Stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells_output.h5ad'
input_Stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells.h5ad'
query = sc.read(filename=input_Stem)

### Preprocess datasets

In [9]:
# Sanity check: does the query matrix still look like raw (integer-valued) counts?
X_is_raw(query)

True

In [10]:
# Sanity check: does the reference matrix still look like raw (integer-valued) counts?
X_is_raw(reference)

True

In [11]:
# Keep only the healthy samples, i.e. cells whose disease label is 'normal'
reference = reference[reference.obs['disease__ontology_label'].eq('normal')]

In [12]:
# Stash the current var index in a temporary leading column called 'gene_name'
query.var.insert(0, 'gene_name', query.var.index)
# Pop that column straight back out and use it as the index: the values are
# unchanged, but the index now carries the name 'gene_name'
query.var.index = query.var.pop('gene_name')

In [13]:
# Keep only the reference cells annotated with one of the stem-cell types
stem_cells_list = [
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4',
]
# Subsetting an AnnData returns a view; `.copy()` turns it into an independent
# object so that the in-place edits of `.obs` / `.var` made in later cells act
# on real data instead of on a view of the full dataset.
reference = reference[reference.obs['Celltype'].isin(stem_cells_list)].copy()

In [14]:
# Harmonise the reference metadata: rename the donor, sample and tissue columns
# and tag every reference cell with its study name.
reference.obs.rename(columns={'donor_id': 'Donor_ID'}, inplace=True)
reference.obs['Study_name'] = 'Kong 2023'
reference.obs.rename(columns={'biosample_id': 'Sample_ID'}, inplace=True)
reference.obs.rename(columns={'tissue': 'Location'}, inplace=True)
# Rename the query column '10X' to the protocol label name.
# NOTE(review): presumably this matches a column of the reference - confirm.
query.obs.rename(
    columns={'10X': 'library_preparation_protocol__ontology_label'},
    inplace=True,
)

In [15]:
# Preserve the current var index as a leading 'gene_id' column ...
reference.var.insert(0, 'gene_id', reference.var.index)
# ... and index the reference genes by the 'gene_name' column instead
reference.var.index = reference.var['gene_name']

# De-duplicate the cell names of both objects, then their gene names
for dataset in (reference, query):
    dataset.obs_names_make_unique()
for dataset in (reference, query):
    dataset.var_names_make_unique()

In [16]:
# Add a `seed_labels` column to both objects: every query cell starts out as
# "Unlabeled", while the reference cells carry over their `Celltype` annotation.
query.obs["seed_labels"] = "Unlabeled"
reference.obs["seed_labels"] = reference.obs["Celltype"]

#### Log1p normalization


In [17]:
# scNym expects log(CPM + 1) input, so a bare log1p on raw counts is not enough:
# first scale every cell to one million total counts, then apply the natural
# log transform. Both datasets receive exactly the same treatment.
sc.pp.normalize_total(reference, target_sum=1e6)
sc.pp.log1p(reference)
sc.pp.normalize_total(query, target_sum=1e6)
sc.pp.log1p(query)

### Merging of two datasets

In [18]:
# Stack reference and query into one object; the new `dataset` column records
# which of the two ('reference' or 'query') each cell came from
adata = reference.concatenate(
    query,
    batch_key='dataset',
    batch_categories=['reference', 'query'],
)

In [None]:
# Only the merged `adata` is used from here on; drop the two separate objects
del reference
del query

### Python API (neural network training)

In [None]:
from scnym.api import scnym_api

In [None]:
# Count the cells per seed label (reference cell types plus the "Unlabeled" query cells)
adata.obs['seed_labels'].value_counts()

### Predict cell labels

In [None]:
# Train scNym on the merged object, with `seed_labels` as the grouping column.
# `out_path` is relative to the current working directory.
training_options = {
    'task': 'train',
    'groupby': 'seed_labels',
    'out_path': './scnym_output',
    'config': 'new_identity_discovery',
}
scnym_api(adata=adata, **training_options)

In [None]:
# Predict with the model saved under `./scnym_output`; the results are added
# to `adata` under the key 'scNym'.
prediction_options = {
    'task': 'predict',
    'key_added': 'scNym',
    'trained_model': './scnym_output',
    'out_path': './scnym_output',
    'config': 'new_identity_discovery',
}
scnym_api(adata=adata, **prediction_options)

In [None]:
# Count the cells per predicted label. The call parentheses are required:
# without them the cell only displays the bound method, not the counts.
adata.obs['scNym'].value_counts()

### Visualise label transfer and confidence using X_scnym

In [None]:
# Build the neighbour graph on the `X_scnym` representation, then compute a
# UMAP with a fixed random seed.
sc.pp.neighbors(adata, n_neighbors=50, use_rep='X_scnym')
sc.tl.umap(adata, spread=5, min_dist=0.2, random_state=1712)

In [None]:
# Colour the UMAP by predicted label, prediction confidence and study.
# The study column is `Study_name`: that is the name this notebook assigns to
# the reference metadata, and no column called "Study" is created anywhere in it.
sc.pl.umap(
    adata,
    color=['scNym', 'scNym_confidence', 'Study_name'],
    color_map='magma',
    size=0.3,
    frameon=False,
    legend_loc='on data',
    legend_fontsize=4,
)

### Save object