#### Create reference dataset with adult healthy colon cells for transfer learning of Xenium add-on panel data with `scANVI`
- **Developed by:** Anna Maguza
- **Affilation:** Faculty of Medicine, Würzburg University
- **Date of creation:** 31th July 2024
- **Last modified date:** 31th July 2024

##### Import packages

In [1]:
import numpy as np
import scanpy as sc
import pandas as pd

### Read in data

In [2]:
adata_ref = sc.read_h5ad('data/Healthy_integrated_data_all_genes.h5ad')

+ leave only adult data

In [3]:
condition_to_keep = ['Healthy adult', 'Adult Ulcerative Colitis Non-inflamed']
adata_ref = adata_ref[adata_ref.obs['Diagnosis'].isin(condition_to_keep), :]

+ Leave only colon data

In [4]:
condition_to_keep = ['Large Intestine', 'Rectum', 'Epithelium', 'Lamina Propria']
adata_ref = adata_ref[adata_ref.obs['Location'].isin(condition_to_keep), :]

+ Modify cell states for easier processing

In [5]:
mapping_states = {
    'Stromal 1 (ADAMDEC1+)': 'Fibroblasts',
    'Stromal 2 (NPY+)': 'Fibroblasts',
    'Stromal 3 (C7+)': 'Fibroblasts',
    'Stromal 3 (KCNN3+)': 'Fibroblasts',
    'Stromal 2 (CH25H+)': 'Fibroblasts',
    'Fibroblasts ADAMDEC1': 'Fibroblasts',
    'Fibroblasts SMOC2 PTGIS': 'Fibroblasts',
    'Stromal 1 (CCL11+)': 'Fibroblasts',
    'Fibroblasts KCNN3 LY6H': 'Fibroblasts',
    'Fibroblasts SFRP2 SLPI': 'Fibroblasts',
    'Fibroblasts NPY SLITRK6': 'Fibroblasts',
    'Activated fibroblasts CCL19 ADAMADEC1': 'Fibroblasts',
    'Inflammatory fibroblasts IL11 CHI3L1': 'Fibroblasts',
    'Stromal 4 (MMP1+)': 'Fibroblasts',
    'Transitional Stromal 3 (C3+)': 'Fibroblasts',
    'cycling stromal': 'Fibroblasts',
    'mLN Stroma (FMO2+)': 'Fibroblasts',
    'Inflammatory fibroblasts IL11 CHI3L1': 'Fibroblasts',
    'Stromal Cycling cells': 'Fibroblasts',

    'Mesoderm 2 (ZEB2+)': 'Mesoderm',
    'Mesoderm 1 (HAND1+)': 'Mesoderm',

    'SMC (PLPP2+)': 'SMC',
    'SMC (PART1/CAPN3+)': 'SMC',

    'myofibroblast (RSPO2+)': 'Myofibroblasts',
    'Myofibroblasts GREM1 GREM2': 'Myofibroblasts',
    'Myofibroblasts HHIP NPNT': 'Myofibroblasts',
    'myofibroblast': 'Myofibroblasts',

    'Immature pericyte': 'Pericytes',
    'Contractile pericyte (PLN+)': 'Pericytes',
    'angiogenic pericyte': 'Pericytes',
    'Pericytes HIGD1B STEAP4': 'Pericytes',
    'Pericyte': 'Pericytes',
    'Pericytes RERGL NTRK2': 'Pericytes',

    'Mesothelium': 'Mesothelium',
    'Mesothelium (RGS5+)': 'Mesothelium',
    'Mesothelium (PRG4+)': 'Mesothelium',

    'T cells Naive CD4': 'CD4 T',
    'T cells CD4 IL17A': 'CD4 T',
    'Th1': 'CD4 T',
    'SELL+ CD4 T': 'CD4 T',
    'IELs ID3 ENTPD1': 'CD8 T',
    'Th17': 'CD4 T',
    'Tfh': 'CD4 T',
    'T cells CD4 FOSB': 'CD4 T',
    'Tregs': 'Tregs',
    'Treg': 'Tregs',
    'Activated CD4 T': 'CD4 T',
    'Activated T': 'CD4 T',
    'Activated CD8 T': 'CD8 T',
    'CX3CR1+ CD8 Tmem': 'CD8 T',
    'T cells CD8': 'CD8 T',
    'T cells CD8 KLRG1': 'CD8 T',
    'CD8 Tmem': 'CD8 T',
    'SELL+ CD8 T': 'CD8 T',
    'NK-like cells ID3 ENTPD1': 'NK',
    'NK cell': 'NK',
    'NK cells KLRF1 CD3G-': 'NK',
    'NK cells KLRF1 CD3G-': 'NK',
    'NK T cell': 'NK',
    'gdT': 'gdT',
    'TRGV5/7 gdT': 'gdT',
    'TRGV2 gdT': 'gdT',
    'TRGV4 gdT': 'gdT',
    'TRDV2/TRGV9 gdT': 'gdT',
    'ILCs': 'ILCs',
    'ILC3': 'ILCs',
    'ILCP': 'ILCs',
    'ILC2': 'ILCs',
    'LTi-like NCR- ILC3': 'ILCs',
    'LTi-like NCR+ ILC3': 'ILCs',
    
    'Macrophages': 'Macrophages',
    'Macrophages CCL3 CCL4': 'Macrophages',
    'LYVE1+ Macrophage': 'Macrophages',
    'Macrophages LYVE1': 'Macrophages',
    'Macrophages Metallothionein': 'Macrophages',
    'MMP9+ Inflammatory macrophage': 'Macrophages',
    'Macrophages PLA2G2D': 'Macrophages',
    'Macrophages CXCL9 CXCL10': 'Macrophages',

    'Monocytes': 'Monocytes',
    'Monocytes HBB': 'Monocytes',
    'Monocytes CHI3L1 CYP27A1': 'Monocytes',
    'Monocytes S100A8 S100A9': 'Monocytes',
    'MPO+ mono-neutrophil': 'Monocytes',

    'DC2 CD1D': 'DC',
    'DC2 CD1D-': 'DC',
    'cDC2': 'DC',
    'DC1': 'DC',
    'cDC1': 'DC',
    'pDC': 'DC',
    'Lymphoid DC': 'DC',
    'FDC': 'DC',
    'Mature DCs': 'DC',

    'Mast cells': 'Mast cells',
    'Mast cell': 'Mast cells',
    'CLC+ Mast cell': 'Mast cells',

    'Immune Cycling cells': 'Immune Cycling cells',

    'Megakaryocyte': 'Megakaryocytes',
    
    'Goblet cell': 'Goblet cells',
    'BEST2+ Goblet cell': 'Goblet cells',
    'Goblet cells MUC2 TFF1': 'Goblet cells',
    'Goblet cells SPINK4': 'Goblet cells',
    'Goblet cells MUC2 TFF1-': 'Goblet cells',
    'Paneth': 'Paneth cells',
    'Paneth cells': 'Paneth cells',
    'TA': 'TA',
    'Tuft': 'Tuft cells',
    'Tuft cells': 'Tuft cells',
    'Enterocyte': 'Enterocyte',
    'Enterocytes BEST4': 'Enterocyte',
    'Enterocytes TMIGD1 MEP1A GSTA1': 'Enterocyte',
    'Enterocytes TMIGD1 MEP1A': 'Enterocyte',
    'Enterocytes CA1 CA2 CA4-': 'Enterocyte',
    'Stem cells OLFM4': 'Stem cells',
    'Stem cells OLFM4 GSTA1': 'Stem cells',
    'Stem cells OLFM4 LGR5': 'Stem cells',
    'Stem cells OLFM4 PCNA': 'Stem cells',
    'Stem_Cells_GCA': 'Stem cells',
    'Stem_Cells_ext': 'Stem cells',
    'Colonocyte': 'Colonocyte',
    'Proximal progenitor': 'Proximal progenitor',
    'Distal progenitor': 'Distal progenitor',
    'EECs' : 'EECs', 
    'Enteroendocrine cells': 'EECs', 
    'M/X cells (MLN/GHRL+)': 'EECs', 
    'CLDN10+ cells': 'EECs',
    'I cells (CCK+)': 'EECs',
    'L cells': 'EECs',
    'D cells (SST+)': 'EECs',
    'L cells (PYY+)': 'EECs',
    'K cells (GIP+)': 'EECs',
    'Enterochromaffin cells': 'EECs',
    'EC cells (NPW+)': 'EECs',
    'N cells (NTS+)': 'EECs',
    'β cells (INS+)': 'EECs',

    'Endothelial cells CD36': 'Endothelial cells',
    'Endothelial cells DARC': 'Endothelial cells',
    'Endothelial cells LTC4S SEMA3G': 'Endothelial cells',

    'LEC6 (ADAMTS4+)': 'LEC',
    'LEC1 (ACKR4+)': 'LEC',
    'LEC3 (ADGRG3+)': 'LEC',
    'LEC5 (CLDN11+)': 'LEC',
    'LEC4 (STAB2+)': 'LEC',

    'Adult Glia' : 'Glial cells',
    'Glial cells' : 'Glial cells',

    'Epithelial Cycling cells': 'TA',

    'IgA plasma cell': 'Plasma cells',

    'Memory B': 'B cells',
    'Naive B': 'B cells',
    'B cells AICDA LRMP': 'B cells',
    'Cycling B cell' : 'B cells',
}

In [6]:
adata_ref.obs['seed_labels'] = adata_ref.obs['Cell States'].astype('category')

  adata_ref.obs['seed_labels'] = adata_ref.obs['Cell States'].astype('category')


In [7]:
existing_categories = set(adata_ref.obs['seed_labels'].cat.categories)

new_categories = set(mapping_states.values())

categories_to_add = new_categories - existing_categories

adata_ref.obs['seed_labels'] = adata_ref.obs['seed_labels'].cat.add_categories(categories_to_add)

adata_ref.obs.loc[adata_ref.obs['Cell States'].isin(mapping_states.keys()), 'seed_labels'] = \
    adata_ref.obs['Cell States'].map(mapping_states)

+ Remove cell states with less than 10 cells

In [11]:
cell_types = adata_ref.obs['seed_labels'].value_counts()
cell_types = cell_types[cell_types >= 10].index
adata_ref = adata_ref[adata_ref.obs['seed_labels'].isin(cell_types), :]

In [14]:
adata_ref.write('data/Healthy_colon_adult.h5ad')