### Notebook to reformat the healthy PBMCs from [Yoshida 2022](https://www.nature.com/articles/s41586-021-04345-x) prior to label transfer with `scANVI`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- v230703

### Import required modules

In [1]:
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

### Set up working environment

In [2]:
# Configure scanpy logging, record the package versions used for this run,
# and set the figure defaults for the notebook.
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.2
-----
PIL                 9.4.0
appnope             0.1.3
asttokens           NA
backcall            0.2.0
cffi                1.15.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
igraph              0.10.4
importlib_resources NA
ipykernel           6.21.2
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.9.1
llvmlite            0.39.1
louvain             0.8.0
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
numba               0.56.4
numexpr             2.8.4
numpy               1.23.5
packaging           23.0
pandas              1.5.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.0.0
prompt_toolkit      3.0.

### Read in the full PBMC dataset (healthy cells are subset below)

In [3]:
# Load the PBMC AnnData object from disk and display its summary.
yoshida_pbmc = sc.read_h5ad(
    filename = '../data/meyer_nikolic_covid_pbmc_raw.h5ad',
)
yoshida_pbmc

AnnData object with n_obs × n_vars = 422220 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'

In [4]:
# List the categories of the `COVID_status` annotation (the healthy subset below filters on this column).
yoshida_pbmc.obs['COVID_status'].cat.categories

Index(['COVID-19', 'Healthy', 'Post-COVID-19'], dtype='object')

In [8]:
# List the categories of the `annotation_detailed` column, which is later copied into `seed_labels`.
yoshida_pbmc.obs['annotation_detailed'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

### Check if data is raw

In [5]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is integer-valued.

    The check is element-wise: comparing column sums with their integer cast
    would accept non-integer entries whose column total happens to be whole
    (e.g. 0.5 + 0.5), so the stored values themselves are inspected instead.

    Parameters
    ----------
    adata
        Object exposing an ``X`` attribute holding either a dense array or a
        scipy sparse matrix.

    Returns
    -------
    bool
        True if all entries are whole numbers (an empty matrix counts as raw),
        False if any entry has a fractional part or is NaN/inf.
    """
    from scipy.sparse import issparse  # local import keeps this cell self-contained

    X = adata.X
    if issparse(X):
        # Implicit zeros are integers already, so only the stored values matter.
        if X.format not in ('csr', 'csc', 'coo'):
            X = X.tocsr()
        values = X.data
    else:
        values = np.asarray(X)

    # Integer dtypes cannot hold fractional values; skip the element-wise pass.
    if np.issubdtype(values.dtype, np.integer):
        return True

    # NaN/inf yield NaN under the modulo and therefore fail the comparison.
    with np.errstate(invalid = 'ignore'):
        return bool(np.all(np.mod(values, 1) == 0))

In [6]:
# Run the raw-count check on the full object before subsetting.
X_is_raw(yoshida_pbmc)

True

In [9]:
# Keep only the cells whose `COVID_status` is 'Healthy'. The explicit `.copy()`
# turns the subset into an independent AnnData object; without it the
# assignments to `.obs` below would be made on a view of `yoshida_pbmc`,
# which triggers an implicit-modification warning.
pbmc_healthy = yoshida_pbmc[yoshida_pbmc.obs['COVID_status'].isin(['Healthy'])].copy()

# Add the label and donor columns under the names used downstream:
# `seed_labels` mirrors `annotation_detailed`, `donor` mirrors `patient_id`.
pbmc_healthy.obs['seed_labels'] = pbmc_healthy.obs['annotation_detailed'].copy()
pbmc_healthy.obs['donor'] = pbmc_healthy.obs['patient_id'].copy()
pbmc_healthy

  pbmc_healthy.obs['seed_labels'] = pbmc_healthy.obs['annotation_detailed'].copy()


AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'seed_labels', 'donor'
    var: 'name'

In [10]:
# List the categories carried by the new `seed_labels` column.
pbmc_healthy.obs['seed_labels'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

In [11]:
# Duplicate `seed_labels` into a separate `cell_states` column, then list its categories.
pbmc_healthy.obs['cell_states'] = pbmc_healthy.obs['seed_labels'].copy()
pbmc_healthy.obs['cell_states'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

### Remove unused `adata.obs` columns

In [12]:
# Display the object to review the available `obs` columns before pruning them.
pbmc_healthy

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'seed_labels', 'donor', 'cell_states'
    var: 'name'

In [13]:
# Restrict `obs` to the columns kept for export; the rest (count/feature
# totals, `Smoker`, `patient_id`, etc.) are dropped. Column order is preserved.
pbmc_healthy.obs = pbmc_healthy.obs[[
    'orig.ident',
    'Age_group', 'BMI',
    'COVID_severity', 'COVID_status',
    'Ethnicity', 'Group', 'Sex',
    'annotation_broad', 'annotation_detailed',
    'sample_id',
    'seed_labels', 'donor', 'cell_states',
]]
pbmc_healthy

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states'
    var: 'name'

### Export object

In [14]:
# Re-run the raw-count check on the healthy subset right before writing it out.
X_is_raw(pbmc_healthy)

True

In [15]:
# Write the healthy subset (with the pruned `obs`) to an .h5ad file next to the input data.
pbmc_healthy.write('../data/meyer_nikolic_healthy_pbmc_raw.h5ad')