### Notebook for the final reference map preparation

- **Developed by:** Anna Maguza
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- 3rd March 2023

### Import required modules

In [1]:
import anndata
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import scanpy as sc

### Set up working environment

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Scanpy logging at verbosity level 3, followed by a printout of the package
# versions used in this session.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults: on-screen and saved resolution, colour map and
# output format for saved figures.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.4.0
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.1
colorama                    0.4.6
comm                        0.1.2
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
h5py                        3.8.0
hypergeom_ufunc             NA
importlib_resources         NA
invgauss_ufunc              NA
ipykernel                   6.19.2
ipython_genutils            0.2.0
jedi                        0.18.1
joblib                      1.2.0
jupyter_server              1.23.6
kiwisolver                  1.4.4
llvmlite                    0.39.1
matplotlib       

### Load data

In [None]:
# --- Cluster paths -----------------------------------------------------------
# Epithelial cells: input file and matching output file
input_file_epi = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/Epithelial_cells/scANVI/HVG_3000/Epithelial_cells_after_scanvi_3K_HVG.h5ad'
output_file_epi = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/Epithelial_cells/scANVI/HVG_3000/Epithelial_cells_after_scanvi_3K_HVG_output.h5ad'

# Mesenchymal cells: input file and matching output file
input_file_mes = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/Mesenchymal_cells/scANVI/Mesenchymal_cells_after_scanvi.h5ad'
output_file_mes = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/Mesenchymal_cells/scANVI/Mesenchymal_cells_after_scanvi_output.h5ad'

# Full reference map (all cells): input file and matching output file.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/GCA_and_StemCells_initial/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad'
output = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/GCA_and_StemCells_initial/Reference_map_(Gut_cell_atlas+Smilie+Wang)_output.h5ad'

# --- Read the three AnnData objects from the cluster paths -------------------
epithelial = sc.read_h5ad(input_file_epi)
mesenchymal = sc.read_h5ad(input_file_mes)
andata = sc.read_h5ad(input)

In [5]:
# --- Local machine paths -----------------------------------------------------
# Epithelial cells: input file and matching output file
input_file_epi = '/Users/anna.maguza/Desktop/Data/Processed_datasets/post_SCANVI/Epithelial/Epithelial_cells_after_scanvi_3K_HVG.h5ad'
output_file_epi = '/Users/anna.maguza/Desktop/Data/Processed_datasets/post_SCANVI/Epithelial/Epithelial_cells_after_scanvi_3K_HVG_output.h5ad'

# Mesenchymal cells: input file and matching output file
input_file_mes = '/Users/anna.maguza/Desktop/Data/Processed_datasets/post_SCANVI/Mesenchymal/Mesenchymal_cells_after_scanvi.h5ad'
output_file_mes = '/Users/anna.maguza/Desktop/Data/Processed_datasets/post_SCANVI/Mesenchymal/Mesenchymal_cells_after_scanvi_output.h5ad'

# Full reference map (all cells): input file and matching output file.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad'
output = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang)_output.h5ad'

# Predicted stem cells: input file and matching output file
input_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells.h5ad'
output_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells_output.h5ad'

# --- Read the four AnnData objects from the local paths ----------------------
epithelial = sc.read_h5ad(input_file_epi)
mesenchymal = sc.read_h5ad(input_file_mes)
andata = sc.read_h5ad(input)
stem = sc.read_h5ad(input_stem)

### Cell ID clean-up and label transfer

In [6]:
# Strip the '-reference' / '-query' suffix from the cell barcodes of the
# epithelial and mesenchymal objects and use the cleaned barcode as the obs
# index (index name: 'cell_id').
#
# Everything from the first occurrence of a suffix onwards is discarded; the
# discarded tail of the last split is kept in the helper column 'cell_id_2',
# which the following cell removes again.
for adata in (epithelial, mesenchymal):
    cell_ids = adata.obs.index.to_series()
    remainder = None

    for suffix in ('-reference', '-query'):
        # split(..., n=1) + .str[...] instead of expand=True assigned to two
        # columns: the expand form raises when no barcode contains the suffix
        # (only one column comes back) or when a barcode contains it more than
        # once (three or more columns come back).
        pieces = cell_ids.str.split(suffix, n = 1)
        cell_ids = pieces.str[0]
        remainder = pieces.str[1]  # NaN where the suffix is absent

    # Assign by position so the values cannot be re-aligned on the old index.
    adata.obs['cell_id'] = cell_ids.to_numpy()
    adata.obs['cell_id_2'] = remainder.to_numpy()

    # Make the cleaned barcode the index
    adata.obs.set_index('cell_id', inplace = True)

In [7]:
# Remove the helper column left over from the barcode split ('cell_id_2') and
# the 'seed_labels' column from both objects, one column at a time.
for column in ('cell_id_2', 'seed_labels'):
    for adata in (epithelial, mesenchymal):
        adata.obs.drop(columns = column, inplace = True)

In [8]:
# Convert the obs index of all three objects to the 'category' dtype
for adata in (epithelial, mesenchymal, andata):
    adata.obs.index = adata.obs.index.astype('category')


In [9]:
# Display the per-cell metadata of the mesenchymal object for inspection.
mesenchymal.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,...,total_counts_ribo,Location,n_counts,percent_mito,percent_ribo,Cell States,dataset,_scvi_batch,_scvi_labels,C_scANVI
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,...,,,,,,Stem_Cells_ext,reference,63,12,Stromal 3 (KCNN3+)
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,...,,,,,,Stem_Cells_ext,reference,63,12,Stromal 3 (KCNN3+)
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,...,,,,,,Stem_Cells_ext,reference,63,12,Mesoderm 1 (HAND1+)
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,...,,,,,,Stem_Cells_ext,reference,63,12,Mesoderm 1 (HAND1+)
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,...,,,,,,Stem_Cells_ext,reference,63,12,Stromal 3 (KCNN3+)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTGTTCCACTC-1-4918STDY7718977,,,Mesenchymal,Gut Cell Atlas,BRC2134,fetal,10Wk,FLI,SC-EPCAMP,M,...,1298.0,LargeInt,4089.0,1.1313293,31.743702,Mesoderm 1 (HAND1+),query,15,28,Mesoderm 1 (HAND1+)
TTTGTCAAGATCCCAT-1-4918STDY7718977,,,Mesenchymal,Gut Cell Atlas,BRC2134,fetal,10Wk,FLI,SC-EPCAMP,M,...,2747.0,LargeInt,6806.0,1.5818175,40.361446,Mesoderm 1 (HAND1+),query,15,28,Mesoderm 1 (HAND1+)
TTTGTCATCCATGCTC-1-4918STDY7718977,,,Mesenchymal,Gut Cell Atlas,BRC2134,fetal,10Wk,FLI,SC-EPCAMP,M,...,1044.0,LargeInt,3411.0,1.2718816,30.60686,Mesoderm 2 (ZEB2+),query,15,28,Mesoderm 2 (ZEB2+)
TTTGTCATCCCAAGTA-1-4918STDY7718977,,,Mesenchymal,Gut Cell Atlas,BRC2134,fetal,10Wk,FLI,SC-EPCAMP,M,...,2589.0,LargeInt,6895.0,1.3823335,37.548946,Mesoderm 1 (HAND1+),query,15,28,Mesoderm 1 (HAND1+)


In [10]:
# Remove the cells whose 'CellType' is 'Stem Cell' from the mesenchymal object.
# Boolean subsetting returns a view of the original AnnData; .copy() turns the
# subset into an independent object, so later changes to it do not go through
# a view of the object that is being replaced here.
mesenchymal = mesenchymal[mesenchymal.obs['CellType'] != 'Stem Cell'].copy()

In [11]:
# Stack the epithelial and mesenchymal objects into a single AnnData.
# join = 'outer' and index_unique = None are passed through unchanged
# (index_unique = None: the existing barcodes are used as they are).
epi_mes = anndata.concat(
    [epithelial, mesenchymal],
    join = 'outer',
    index_unique = None,
)

In [12]:
# Barcode -> label lookup, seeded with the 'C_scANVI' labels of `epi_mes`
# (for a barcode that occurs more than once, the last label wins).
barcode_to_label = dict(zip(epi_mes.obs.index, epi_mes.obs['C_scANVI']))

# Fall back to the 'Cell States' label of `andata` for every barcode that has
# no entry yet or whose entry is missing (NA).
for barcode, label in zip(andata.obs.index, andata.obs['Cell States']):
    if pd.isna(barcode_to_label.get(barcode)):
        barcode_to_label[barcode] = label

In [13]:
# Look up the transferred label of every barcode in `andata` and store it in a
# new 'Cell Label' column (None for a barcode without an entry).
andata.obs['Cell Label'] = list(map(barcode_to_label.get, andata.obs.index))

In [14]:
# Display the per-cell metadata of the full object, including the new
# 'Cell Label' column.
andata.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,...,doublet_scores,predicted_doublets,Age_group,total_counts_ribo,Location,n_counts,percent_mito,percent_ribo,Cell States,Cell Label
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,4384,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TGAGCATTCCAGTAGT,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,15655,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TGGCCAGAGAGGACGG,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,26921,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TTCTTAGCAGTCCTTC,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,22468,,,Stem_Cells_ext,Stem_Cells_ext


In [87]:
# Save the annotated object - cluster path
andata.write_h5ad(
    '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/All_cell_types/All_cell_types_after_scanvi.h5ad'
)

In [15]:
# Save the annotated object - local machine path
andata.write_h5ad(
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/Healthy_reference_predicted/SCANVI_predicted_cell_types_1.h5ad'
)

### Second dataset - with 15K stem cells

In [27]:
# --- Local machine paths -----------------------------------------------------
# Full reference map (all cells): input file and matching output file.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad'
output = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang)_output.h5ad'

# Predicted stem cells: input file and matching output file
input_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells.h5ad'
output_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells_output.h5ad'

# --- Re-read both objects from disk ------------------------------------------
andata = sc.read_h5ad(input)
stem = sc.read_h5ad(input_stem)

In [29]:
# Start 'Predicted Cell Labels' from the existing 'Cell States' annotation.
# NOTE(review): a later cell assigns this column again from `barcode_to_label`,
# so this initial value is replaced there.
andata.obs['Predicted Cell Labels'] = andata.obs['Cell States']

In [31]:
# Display the per-cell metadata of the stem-cell object for inspection.
stem.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,...,doublet_scores,predicted_doublets,Age_group,total_counts_ribo,Location,n_counts,percent_mito,percent_ribo,Cell States,Cell Label
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,,,,,,...,,,,,,,,,Stem_Cells_ext,Stem_Cells_ext
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,4384,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TGAGCATTCCAGTAGT,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,15655,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TGGCCAGAGAGGACGG,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,26921,,,Stem_Cells_ext,Stem_Cells_ext
N110.LPA.TTCTTAGCAGTCCTTC,,N110.LPA,Stem Cell,Smilie,N110,Non-inflamed,,,,,...,,,,,LP,22468,,,Stem_Cells_ext,Stem_Cells_ext


In [32]:
# Barcode -> label lookup, seeded with the 'Cell Label' column of `stem`
# (for a barcode that occurs more than once, the last label wins).
barcode_to_label = dict(zip(stem.obs.index, stem.obs['Cell Label']))

# Fall back to the 'Cell States' label of `andata` for every barcode that has
# no entry yet or whose entry is missing (NA).
for barcode, label in zip(andata.obs.index, andata.obs['Cell States']):
    if pd.isna(barcode_to_label.get(barcode)):
        barcode_to_label[barcode] = label

In [33]:
# Look up the transferred label of every barcode in `andata` and store it in
# the 'Predicted Cell Labels' column (None for a barcode without an entry).
andata.obs['Predicted Cell Labels'] = list(map(barcode_to_label.get, andata.obs.index))

In [35]:
# Number of cells per value of 'Predicted Cell Labels'.
# The result is only stored in `df`; this cell does not display it.
df = andata.obs['Predicted Cell Labels'].value_counts()

In [36]:
# Save the annotated object - local machine path
andata.write_h5ad(
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/Healthy_reference_predicted/SCANVI_predicted_cell_types_15K_stem_cells.h5ad'
)

### Reference Object Description

In [5]:
# Load the saved reference object (all cell types after scANVI) from the
# cluster path. `output` is only defined here; it is not used in this cell.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/All_cell_types/All_cell_types_after_scanvi.h5ad'
output = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Reference_maps/GCA_Stem_cells/All_cell_types/All_cell_types_after_scanvi_output.h5ad'
andata = sc.read_h5ad(input)



In [15]:
# Display the AnnData summary (dimensions and obs column names).
andata

AnnData object with n_obs × n_vars = 375293 × 40144
    obs: 'UniqueCell_ID', 'Sample_ID', 'CellType', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', '10X', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts_mt', 'doublet_scores', 'predicted_doublets', 'Age_group', 'total_counts_ribo', 'Location', 'n_counts', 'percent_mito', 'percent_ribo', 'Cell States', 'Cell Label'

In [16]:
# Number of cells per value of 'Cell Label'.
# The result is only stored in `df`; this cell does not display it.
df = andata.obs['Cell Label'].value_counts()

In [17]:
# Boolean flag column 'Stem Cell': True where 'Cell Label' equals
# 'Stem_Cells_ext', False for every other cell.
andata.obs['Stem Cell'] = andata.obs['Cell Label'].eq('Stem_Cells_ext')