## Notebook for transferring labels from a healthy epithelial reference to cancer cells using `scBalance`

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 8th June 2023

### Load required modules

In [1]:
import scBalance as sb
import scBalance.scbalance_IO as ss
import scanpy as sc
import pandas as pd
import numpy as np

### Data upload

In [105]:
# Healthy epithelial reference; later cells take the training expression table and labels from it.
input_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Healthy_epithelial_cells_all_genes.h5ad'
Healthy_adata = sc.read(input_healthy)

In [106]:
# Tumour-cell dataset; later cells use it as the test set that receives predicted labels.
input_cancer = '/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad'
Cancer_adata = sc.read(input_cancer)

In [107]:
# Keep only the cells annotated as epithelial.
# `.copy()` materialises the subset: plain AnnData slicing returns a view, and
# later cells add a layer to this object and normalise it in place.
Cancer_adata = Cancer_adata[Cancer_adata.obs['Cell Type'] == 'Epithelial', :].copy()

In [108]:
# Remove Paneth cells from healthy epithelial cells
#Healthy_adata = Healthy_adata[Healthy_adata.obs['Unified Cell States'] != 'Paneth cells',:]

### Preprocess

In [109]:
# Snapshot the current expression matrix in a `counts` layer; HVG selection reads from it.
Healthy_adata.layers['counts'] = Healthy_adata.X.copy()

### HVGs selection
# Select highly variable genes on the healthy reference and subset it to them.
sc.pp.highly_variable_genes(
    Healthy_adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Library_Preparation_Protocol",
    n_top_genes=7000,
    span=1,
    subset=True,
)

In [110]:
# Snapshot the cancer expression matrix in a `counts` layer before it is normalised.
Cancer_adata.layers['counts'] = Cancer_adata.X.copy()

# Restrict both datasets to the genes they share, i.e. the healthy HVGs that
# are also present in the cancer dataset.

# Gene names must be unique strings before they can be matched by name.
Cancer_adata.var.index = Cancer_adata.var.index.astype(str)
Cancer_adata.var_names_make_unique()

# Walk the healthy gene list so the shared genes keep a reproducible order;
# a bare set intersection yields an arbitrary order that can change between runs.
cancer_gene_set = set(Cancer_adata.var_names)
common_genes = [
    gene_name for gene_name in Healthy_adata.var_names
    if gene_name in cancer_gene_set
]

# Indexing both objects with the same list gives them an identical gene order,
# so no separate re-ordering step is needed.
# `.copy()` turns the resulting views into standalone objects, so the in-place
# normalisation that follows does not have to convert a view first.
Healthy_adata = Healthy_adata[:, common_genes].copy()
Cancer_adata = Cancer_adata[:, common_genes].copy()

In [111]:
# Normalise each dataset to a total of 1e4 per cell, then log1p-transform it.
# The healthy reference is processed first, then the cancer cells.
for dataset in (Healthy_adata, Cancer_adata):
    sc.pp.normalize_total(dataset, target_sum=1e4)
    sc.pp.log1p(dataset)

  view_to_actual(adata)


In [112]:
# Genes present in both datasets; the next cell uses them to select the expression columns.
# `Index.intersection` replaces the `&` operator, whose use as a set operation
# on a pandas Index is deprecated.
gene = Healthy_adata.var_names.intersection(Cancer_adata.var_names)

  gene = Healthy_adata.var_names & Cancer_adata.var_names


In [113]:
# Expression tables restricted to the shared genes:
# the healthy reference is the training set, the cancer cells are the test set.
X_train = Healthy_adata.to_df().loc[:, gene]
X_test = Cancer_adata.to_df().loc[:, gene]

In [114]:
# Training labels: the healthy 'Unified Cell States' annotation as a one-column DataFrame.
# The column is renamed to 'Label' to be consistent with the scBalance input.
y_train = Healthy_adata.obs[['Unified Cell States']].rename(
    columns={'Unified Cell States': 'Label'}
)

In [115]:
# Encode the labels as integers, keeping the categories so predictions can be decoded later.
label_categorical = y_train['Label'].astype('category')
categories = label_categorical.cat.categories

# Integer code of every training cell's label.
y_train_values = label_categorical.cat.codes.values

# sb.scBalance() is given the codes as a one-column DataFrame.
y_train = pd.DataFrame({'Label': y_train_values})

In [116]:
# Pass the cancer expression table, the healthy expression table and the healthy integer labels to scBalance.
# The next cell treats the returned values as integer label codes and decodes them.
# Runs on the CPU with weighted sampling switched on.
pred_result = sb.scBalance(X_test, X_train, y_train, processing_unit = 'cpu', weighted_sampling = True)

--------Start annotating----------
Computational unit be used is: cpu
--------Annotation Finished----------


In [117]:
# Put the predicted integer codes into a one-column DataFrame ...
pred_result_df = pd.DataFrame(pred_result, columns=['Label'])

# ... and translate every code back to its original label via the stored categories.
predicted_codes = pred_result_df['Label'].to_numpy()
pred_result_df['Label'] = categories[predicted_codes]

In [118]:
# Pull the decoded labels out of the DataFrame as a plain numpy array ...
pred_result_array = pred_result_df['Label'].to_numpy()

# ... and store them on the cancer AnnData as the 'Predicted Label' column of `.obs`.
Cancer_adata.obs['Predicted Label'] = pred_result_array

In [119]:
# Number of cancer cells assigned to each predicted label.
Cancer_adata.obs['Predicted Label'].value_counts()

TA                        34035
Enterocyte                  698
Tuft cells                  332
Epithelial cells            306
Colonocyte                  290
Enteroendocrine cells        35
L cells                       7
Microfold cell                6
Enterochromaffin cells        3
Stem cells                    2
Name: Predicted Label, dtype: int64

In [120]:
# Save the cancer AnnData, which now carries the 'Predicted Label' column, to disk.
# NOTE(review): the file name says "3000HVGs_no_Paneth", but this notebook selects
# 7000 HVGs and the Paneth-cell filter is commented out. Confirm the name matches the run.
Cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/output/Epithelial/Joanito_predicted_labels_with_scBalance_3000HVGs_no_Paneth.h5ad')

In [131]:
# Load a previously saved prediction file (the 2000-HVG run, per its file name).
# The path variable is not called `input`, because that would shadow the Python builtin.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/output/Epithelial/Joanito_predicted_labels_with_scBalance_2000HVGs.h5ad'
adata = sc.read(input_path)

In [132]:
# Number of cells per predicted label in the loaded file.
adata.obs['Predicted Label'].value_counts()

Paneth cells              20805
Enterocyte                14489
Tuft cells                  179
Epithelial cells            109
Enteroendocrine cells       107
L cells                       9
TA                            7
Microfold cell                4
Enterochromaffin cells        3
Stem cells                    2
Name: Predicted Label, dtype: int64