### Notebook to reformat the Human Lung Cell Atlas Healthy object prior to label transfer with `scANVI`

- **Developed by:** Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- v230511

### Import required modules

In [1]:
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

### Set up working environment

In [2]:
# Set scanpy's logging verbosity to level 3 and print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every plot produced in this notebook.
figure_defaults = dict(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)
sc.settings.set_figure_params(**figure_defaults)

-----
anndata     0.8.0
scanpy      1.9.2
-----
PIL                 9.4.0
appnope             0.1.3
asttokens           NA
backcall            0.2.0
cffi                1.15.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
igraph              0.10.4
importlib_resources NA
ipykernel           6.21.2
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.9.1
llvmlite            0.39.1
louvain             0.8.0
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
numba               0.56.4
numexpr             2.8.4
numpy               1.23.5
packaging           23.0
pandas              1.5.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.0.0
prompt_toolkit      3.0.

### Read in Healthy data

In [3]:
# Location of the HLCA healthy object, relative to this notebook.
hlca_healthy_path = '../data/HLCA_Healthy_v1.1.h5ad'

# Load the object and display its summary.
hlca_healthy = sc.read_h5ad(hlca_healthy_path)
hlca_healthy

AnnData object with n_obs × n_vars = 584944 × 28024
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_level_1', '

In [4]:
# List the categories of the finest annotation level available in the object.
hlca_healthy.obs['ann_finest_level'].cat.categories

Index(['Basal resting', 'Suprabasal', 'Hillock-like', 'Deuterosomal',
       'Multiciliated (nasal)', 'Multiciliated (non-nasal)',
       'Club (non-nasal)', 'Club (nasal)', 'Goblet (nasal)',
       'Goblet (bronchial)', 'Goblet (subsegmental)', 'AT0',
       'pre-TB secretory', 'Ionocyte', 'Tuft', 'Neuroendocrine',
       'SMG serous (nasal)', 'SMG serous (bronchial)', 'SMG mucous',
       'SMG duct', 'AT1', 'AT2', 'AT2 proliferating', 'EC arterial',
       'EC aerocyte capillary', 'EC general capillary', 'EC venous systemic',
       'EC venous pulmonary', 'Lymphatic EC mature',
       'Lymphatic EC proliferating', 'Lymphatic EC differentiating',
       'Peribronchial fibroblasts', 'Adventitial fibroblasts',
       'Alveolar fibroblasts', 'Pericytes', 'Subpleural fibroblasts',
       'Myofibroblasts', 'Smooth muscle', 'Smooth muscle FAM83D+',
       'SM activated stress response', 'Mesothelium',
       'Hematopoietic stem cells', 'B cells', 'Plasma cells', 'CD4 T cells',
       'CD8 T

### Check if data is raw

In [5]:
def X_is_raw(adata, chunk_size = 4_000_000):
    """Return True when every value stored in ``adata.X`` is integer-valued.

    The check is done per entry rather than on per-gene sums: a column of
    non-integer values can still add up to an integer (e.g. 0.5 + 0.5), which
    would make a sum-based test report normalised data as raw counts.

    Parameters
    ----------
    adata
        Any object exposing an ``X`` attribute holding a dense array or a
        scipy sparse matrix (e.g. an ``AnnData`` or its ``.raw``).
    chunk_size
        Number of values inspected at a time, to bound the size of the
        temporary arrays on large matrices.

    Returns
    -------
    bool
        True if all stored values are whole numbers (an empty matrix counts
        as raw); False if any value has a fractional part or is NaN/inf.
    """
    # Local import keeps the notebook's import cell unchanged.
    from scipy import sparse

    X = adata.X
    if sparse.issparse(X):
        # Implicit zeros are integers already, so only stored values matter.
        has_flat_data = getattr(X, 'format', None) in ('csr', 'csc', 'coo')
        values = X.data if has_flat_data else X.tocsr().data
    else:
        values = np.asarray(X)

    # Boolean and integer dtypes cannot hold fractional values.
    if values.size == 0 or values.dtype.kind in 'biu':
        return True

    flat_values = values.reshape(-1)
    # NaN/inf produce NaN under mod, which compares unequal to 0 -> not raw.
    with np.errstate(invalid = 'ignore'):
        for start in range(0, flat_values.size, chunk_size):
            chunk = flat_values[start:start + chunk_size]
            if not np.all(np.mod(chunk, 1) == 0):
                return False
    return True

In [6]:
# Run the raw-count check on the matrix stored under `.raw`, which is the one used to rebuild the object below.
X_is_raw(hlca_healthy.raw)

True

In [8]:
# Rebuild the object around the matrix stored in `.raw`, keeping the cell and gene metadata.
# NOTE(review): `var` comes from the processed object rather than `.raw.var`; this relies on
# both holding the same genes in the same order — confirm.
hlca_healthy_raw = anndata.AnnData(
    X = hlca_healthy.raw.X,
    obs = hlca_healthy.obs,
    var = hlca_healthy.var,
)

# Duplicate the finest annotation and the donor identifier under the new column names.
for new_column, source_column in (('seed_labels', 'ann_finest_level'), ('donor', 'donor_id')):
    hlca_healthy_raw.obs[new_column] = hlca_healthy_raw.obs[source_column].copy()

hlca_healthy_raw

AnnData object with n_obs × n_vars = 584944 × 28024
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_level_1', '

In [11]:
# List the categories of the `seed_labels` column of the rebuilt object.
hlca_healthy_raw.obs['seed_labels'].cat.categories

Index(['Basal resting', 'Suprabasal', 'Hillock-like', 'Deuterosomal',
       'Multiciliated (nasal)', 'Multiciliated (non-nasal)',
       'Club (non-nasal)', 'Club (nasal)', 'Goblet (nasal)',
       'Goblet (bronchial)', 'Goblet (subsegmental)', 'AT0',
       'pre-TB secretory', 'Ionocyte', 'Tuft', 'Neuroendocrine',
       'SMG serous (nasal)', 'SMG serous (bronchial)', 'SMG mucous',
       'SMG duct', 'AT1', 'AT2', 'AT2 proliferating', 'EC arterial',
       'EC aerocyte capillary', 'EC general capillary', 'EC venous systemic',
       'EC venous pulmonary', 'Lymphatic EC mature',
       'Lymphatic EC proliferating', 'Lymphatic EC differentiating',
       'Peribronchial fibroblasts', 'Adventitial fibroblasts',
       'Alveolar fibroblasts', 'Pericytes', 'Subpleural fibroblasts',
       'Myofibroblasts', 'Smooth muscle', 'Smooth muscle FAM83D+',
       'SM activated stress response', 'Mesothelium',
       'Hematopoietic stem cells', 'B cells', 'Plasma cells', 'CD4 T cells',
       'CD8 T

### Subset Epithelial/Stromal cells

In [12]:
# Cell states retained for the epithelial/stromal reference.
epithelial_stromal_labels = [
    'Basal resting', 'Suprabasal', 'Hillock-like', 'Deuterosomal',
    'Multiciliated (nasal)', 'Multiciliated (non-nasal)',
    'Club (non-nasal)', 'Club (nasal)',
    'Goblet (nasal)', 'Goblet (bronchial)', 'Goblet (subsegmental)',
    'AT0', 'pre-TB secretory', 'Ionocyte', 'Tuft', 'Neuroendocrine',
    'AT1', 'AT2', 'AT2 proliferating',
    'Myofibroblasts', 'Mesothelium',
]

# Boolean mask over cells whose seed label is one of the retained states.
is_selected_cell = hlca_healthy_raw.obs['seed_labels'].isin(epithelial_stromal_labels)

hlca_epithelial_raw = hlca_healthy_raw[is_selected_cell]
hlca_epithelial_raw

View of AnnData object with n_obs × n_vars = 278311 × 28024
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_lev

- Remove cells with 'None' annotation

In [13]:
# Drop cells whose seed label is the literal string 'None'.
# `.copy()` turns the resulting view into an independent object, so that the in-place
# gene filter below modifies a real AnnData instead of triggering an implicit copy
# (and the accompanying warning) on a view of `hlca_epithelial_raw`.
hlca_epistr_clean = hlca_epithelial_raw[~hlca_epithelial_raw.obs['seed_labels'].isin(['None'])].copy()

# Remove genes detected in fewer than 3 of the retained cells (adds `n_cells` to `.var`).
sc.pp.filter_genes(hlca_epistr_clean, min_cells = 3)

# Number of cells per retained label.
hlca_epistr_clean.obs['seed_labels'].value_counts()

filtered out 405 genes that are detected in less than 3 cells


  adata.var['n_cells'] = number


AT2                          61429
Suprabasal                   41158
Basal resting                38955
Goblet (nasal)               35833
Multiciliated (non-nasal)    35225
Club (nasal)                 26068
Club (non-nasal)              9955
AT1                           7937
Multiciliated (nasal)         4869
Hillock-like                  4600
pre-TB secretory              4393
Goblet (bronchial)            1670
AT0                           1440
Deuterosomal                  1004
AT2 proliferating              976
Goblet (subsegmental)          968
Myofibroblasts                 716
Ionocyte                       561
Mesothelium                    230
Tuft                           165
Neuroendocrine                 159
Name: seed_labels, dtype: int64

In [14]:
# Store an independent copy of the seed labels under `cell_states`.
seed_labels = hlca_epistr_clean.obs['seed_labels']
hlca_epistr_clean.obs['cell_states'] = seed_labels.copy()

# Show the categories carried by the new column.
hlca_epistr_clean.obs['cell_states'].cat.categories

Index(['Basal resting', 'Suprabasal', 'Hillock-like', 'Deuterosomal',
       'Multiciliated (nasal)', 'Multiciliated (non-nasal)',
       'Club (non-nasal)', 'Club (nasal)', 'Goblet (nasal)',
       'Goblet (bronchial)', 'Goblet (subsegmental)', 'AT0',
       'pre-TB secretory', 'Ionocyte', 'Tuft', 'Neuroendocrine', 'AT1', 'AT2',
       'AT2 proliferating', 'Myofibroblasts', 'Mesothelium'],
      dtype='object')

### Remove unused `adata.obs` and `adata.var` columns

In [15]:
# Display the object summary to see which `obs`/`var` columns are present before trimming them.
hlca_epistr_clean

AnnData object with n_obs × n_vars = 278311 × 27619
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_level_1', '

In [19]:
# Cell-level metadata columns to keep; every other `obs` column is dropped.
obs_columns_to_keep = [
    'ann_finest_level',
    'sample',
    'disease',
    'organism',
    'sex',
    'tissue',
    'seed_labels',
    'donor',
    'cell_states',
]

hlca_epistr_clean.obs = hlca_epistr_clean.obs.loc[:, obs_columns_to_keep]
hlca_epistr_clean

AnnData object with n_obs × n_vars = 278311 × 27619
    obs: 'ann_finest_level', 'sample', 'disease', 'organism', 'sex', 'tissue', 'seed_labels', 'donor', 'cell_states'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'

In [20]:
# Gene-level metadata columns to keep; this drops `feature_is_filtered` and `n_cells`.
var_columns_to_keep = [
    'feature_name',
    'feature_reference',
    'feature_biotype',
]

hlca_epistr_clean.var = hlca_epistr_clean.var.loc[:, var_columns_to_keep]
hlca_epistr_clean

AnnData object with n_obs × n_vars = 278311 × 27619
    obs: 'ann_finest_level', 'sample', 'disease', 'organism', 'sex', 'tissue', 'seed_labels', 'donor', 'cell_states'
    var: 'feature_name', 'feature_reference', 'feature_biotype'

### Export object

In [22]:
# Re-run the raw-count check on the subset object right before writing it to disk.
X_is_raw(hlca_epistr_clean)

True

In [23]:
# Write the trimmed object to disk as an .h5ad file.
# NOTE(review): the file name says "Epithelial", but the object also contains the
# 'Myofibroblasts' and 'Mesothelium' labels selected earlier — confirm the name is intended.
hlca_epistr_clean.write('../data/HLCA_Healthy_Epithelial_v1.1.h5ad')