### Notebook to reformat the healthy PBMCs from [Yoshida 2022](https://www.nature.com/articles/s41586-021-04345-x) and the ImYoo private data prior to label transfer experiments.

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilians-Universität Würzburg**
- v230808

### Import required modules

In [None]:
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

### Set up working environment

In [None]:
# Logging verbosity for scanpy, plus a printout of the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults applied to the plots produced in this notebook.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    format='svg',
    vector_friendly=True,
)

In [None]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is a whole number.

    Raw count matrices hold integers only, so any fractional value means the
    matrix was normalised or otherwise transformed. Values are checked
    element-wise: comparing column sums can miss fractional entries that
    happen to add up to a whole number (e.g. 0.5 + 0.5).

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X`` (dense array or
        SciPy sparse matrix).

    Returns
    -------
    bool
        True if all stored values are integer-valued, False otherwise.
    """
    # Local import: the notebook's import cell does not bring in scipy.
    from scipy import sparse

    matrix = adata.X
    # Implicit zeros of a sparse matrix are integers, so only the explicitly
    # stored values need to be inspected.
    values = matrix.tocsr().data if sparse.issparse(matrix) else np.asarray(matrix)
    if values.dtype.kind in 'biu':
        # Boolean / integer dtypes cannot hold fractional values.
        return True
    return bool(np.all(np.mod(values, 1) == 0))

### Read in `Yoshida M, 2022` Healthy data

In [None]:
# Load the Yoshida et al. COVID PBMC object from disk (the file name suggests raw
# counts; this is verified with X_is_raw in a later cell) and display its summary.
yoshida_pbmc = sc.read_h5ad('../data/meyer_nikolic_covid_pbmc_raw.h5ad')
yoshida_pbmc

In [None]:
X_is_raw(yoshida_pbmc)

In [None]:
yoshida_pbmc.obs['COVID_status'].cat.categories

In [None]:
yoshida_pbmc.obs['annotation_detailed'].cat.categories

### Make covariates uniform

In [None]:
# Restrict the Yoshida object to healthy donors. The explicit `.copy()` turns the
# subset into an independent AnnData object; assigning new `.obs` columns to a
# view would otherwise force an implicit copy (ImplicitModificationWarning).
is_healthy = yoshida_pbmc.obs['COVID_status'].isin(['Healthy'])
pbmc_healthy = yoshida_pbmc[is_healthy].copy()

# Covariate names shared with the ImYoo object: labels, donor and sample.
pbmc_healthy.obs['seed_labels'] = pbmc_healthy.obs['annotation_detailed'].copy()
pbmc_healthy.obs['donor'] = pbmc_healthy.obs['patient_id'].copy()
pbmc_healthy.obs['sample'] = pbmc_healthy.obs['sample_id'].copy()
pbmc_healthy

In [None]:
pbmc_healthy.obs['seed_labels'].cat.categories

In [None]:
# Start `cell_states` as an independent copy of `seed_labels`; its categories are
# renamed in a later cell, leaving `seed_labels` untouched.
pbmc_healthy.obs['cell_states'] = pbmc_healthy.obs['seed_labels'].copy()
pbmc_healthy.obs['cell_states'].cat.categories

### Remove unused `adata.obs` columns

In [None]:
pbmc_healthy

In [None]:
# Keep only the covariates needed downstream. 'sample' (the harmonised copy of
# 'sample_id' created earlier) has to be retained: the ImYoo object keeps its own
# 'sample' column and the merged object is later accessed via `adata.obs['sample']`,
# which would otherwise hold only missing values for every Yoshida cell.
kept_obs_columns = [
    'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id',
    'sample', 'seed_labels', 'donor', 'cell_states',
]
# `.copy()` makes the new `.obs` an independent frame rather than a slice.
pbmc_healthy.obs = pbmc_healthy.obs[kept_obs_columns].copy()
pbmc_healthy

### Read and process `ImYoo` private PBMC data

In [None]:
# Load the private ImYoo capillary-blood PBMC object from disk and display its summary.
imyoo_pbmc = sc.read_h5ad('../data/imyoo_capillary_blood_samples_76535_pbmcs.h5ad')
imyoo_pbmc

In [None]:
# Use the names stored in `.var['name']` (as strings) for the variable index.
gene_names = imyoo_pbmc.var['name'].astype(str)
imyoo_pbmc.var_names = pd.Index(gene_names)

# De-duplicate repeated names, then make sure the index is string-typed.
imyoo_pbmc.var_names_make_unique()
imyoo_pbmc.var_names = imyoo_pbmc.var_names.astype(str)
imyoo_pbmc.var_names

In [None]:
X_is_raw(imyoo_pbmc)

In [None]:
imyoo_pbmc.obs['cell_type_level_3'].cat.categories

In [None]:
# Harmonised covariate name -> ImYoo source column it is copied from.
imyoo_covariate_sources = {
    'seed_labels': 'cell_type_level_3',
    'donor': 'Participant IDs',
    'sample': 'Sample IDs',
}
for harmonised_name, source_column in imyoo_covariate_sources.items():
    imyoo_pbmc.obs[harmonised_name] = imyoo_pbmc.obs[source_column].copy()
imyoo_pbmc

In [None]:
# Start `cell_states` as an independent copy of `seed_labels`; its categories are
# renamed in a later cell, leaving `seed_labels` untouched.
imyoo_pbmc.obs['cell_states'] = imyoo_pbmc.obs['seed_labels'].copy()
imyoo_pbmc.obs['cell_states'].cat.categories

In [None]:
imyoo_pbmc

In [None]:
# Covariates retained for the ImYoo object: the four annotation levels plus the
# harmonised label, donor, sample and cell-state columns.
imyoo_obs_columns = [
    'cell_type_level_1',
    'cell_type_level_2',
    'cell_type_level_3',
    'cell_type_level_4',
    'seed_labels',
    'donor',
    'sample',
    'cell_states',
]
imyoo_pbmc.obs = imyoo_pbmc.obs[imyoo_obs_columns]
imyoo_pbmc

### Make annotations uniform

In [None]:
imyoo_pbmc.obs['cell_states'].cat.categories

In [None]:
# Short names for the ImYoo cell states. The list is positional: entry i renames
# the i-th existing category, so it must follow the category order of
# `imyoo_pbmc.obs['cell_states']` exactly.
# NOTE(review): some labels are spelled differently from the Yoshida ones
# ('CD14+Monocytes' vs 'CD14+Monocyte', 'CD16+Monocytes' vs 'CD16+Monocyte',
# 'asDC' vs 'AS-DC') - confirm whether these are meant to stay distinct.
imyoo_state_names = [
    'pDC', 'CD4+T', 'gdT', 'cDC2',
    'CD14+CD16+Monocytes', 'cDC3', 'CD56+NK', 'tumorDC',
    'CLL-associated_B', 'class_memB',
    'muco_invarT', 'plasma_B',
    'CD56_dimNK', 'CD14+Monocytes', 'CD8+T',
    'CD16+Monocytes', 'naive_B', 'asDC', 'Mast',
    'IgM_memB', 'CD14+Monocytes-HSP_artifact',
    'Ageing_B', 'adaptive_NK',
]
imyoo_states = imyoo_pbmc.obs['cell_states']
imyoo_pbmc.obs['cell_states'] = imyoo_states.cat.rename_categories(imyoo_state_names)
imyoo_pbmc.obs['cell_states'].cat.categories

In [None]:
pbmc_healthy.obs['cell_states'].cat.categories

In [None]:
# Short names for the Yoshida cell states. The list is positional: entry i renames
# the i-th existing category, so it must follow the category order of
# `pbmc_healthy.obs['cell_states']` exactly.
# Labels are written without leading/trailing whitespace; a padded label such as
# ' hCD4+T ' would silently fail to match 'hCD4+T' in later lookups.
yoshida_state_names = [
    'IFN-stim_HPC', 'IFN-stim_n-sw_memB', 'IFN-stim_naive_B',
    'IFN-stim_CD16+Monocyte', 'IFN-stim_CD14+Monocyte', 'IFN-stim_NK',
    'IFN-stim_ctlCD8+T', 'IFN-stim_naiveCD4+T', 'RBC', 'platelets',
    'cycling', 'Baso/Eos', 'HPC', 'plasmablasts', 'plasma_B', 'invarB',
    'sw_memB', 'n-sw_memB', 'naive_B', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
    'CD16+C1+Monocyte', 'CD16+Monocyte', 'CD14+IL6+Monocyte',
    'CD14+Monocyte', 'ILC', 'CD56+NK', 'NK', 'NKT', 'MAIT', 'regT',
    'gdT', 'ctlCD8+T', 'emraCD8+T', 'emCD8+T', 'cmCD8+T',
    'naiveCD8+T', 'ctlCD4+T', 'hCD4+T', 'naiveCD4+T',
]
pbmc_healthy.obs['cell_states'] = pbmc_healthy.obs['cell_states'].cat.rename_categories(yoshida_state_names)
pbmc_healthy.obs['cell_states'].cat.categories

### Merge objects

In [None]:
# Stack both objects along the cell axis; each cell is tagged with its dataset of
# origin in the 'generator' column, and `join = 'inner'` is requested for the
# variables.
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases in
# favour of `anndata.concat`; the two differ in how `.obs` columns are joined,
# so a migration needs to be checked against this output.
adata = pbmc_healthy.concatenate(
    imyoo_pbmc,
    join='inner',
    batch_key='generator',
    batch_categories=['YoshidaM_2022', 'ImYoo_2023'],
)
adata

In [None]:
adata

In [None]:
# Make the merged labels categorical and list the combined label set.
# Categorical columns are converted back to plain strings in a later cell.
adata.obs['seed_labels'] = adata.obs['seed_labels'].astype('category')
adata.obs['seed_labels'].cat.categories

In [None]:
# Make the merged cell states categorical and list the combined set of states.
adata.obs['cell_states'] = adata.obs['cell_states'].astype('category')
adata.obs['cell_states'].cat.categories

In [None]:
# Cast sample identifiers to strings first, then to a categorical.
# Note that `.astype(str)` turns any missing value into the literal string 'nan'.
adata.obs['sample'] = adata.obs['sample'].astype(str).astype('category')
adata.obs['sample'].cat.categories

In [None]:
# 'generator' is the batch key added by the concatenate call; make it categorical
# and list the dataset labels.
adata.obs['generator'] = adata.obs['generator'].astype('category')
adata.obs['generator'].cat.categories

In [None]:
# Make the donor identifiers categorical and list the donors in the merged object.
adata.obs['donor'] = adata.obs['donor'].astype('category')
adata.obs['donor'].cat.categories

In [None]:
adata.obs['cell_states'].cat.categories

In [None]:
# Convert every categorical `.obs` column to plain strings.
# The dtype is tested with isinstance against `pd.CategoricalDtype` because
# `pd.api.types.is_categorical_dtype` is deprecated in recent pandas releases.
for col in adata.obs.columns:
    if isinstance(adata.obs[col].dtype, pd.CategoricalDtype):
        adata.obs[col] = adata.obs[col].astype(str)

In [None]:
# Convert every categorical `.var` column to plain strings.
# The dtype is tested with isinstance against `pd.CategoricalDtype` because
# `pd.api.types.is_categorical_dtype` is deprecated in recent pandas releases.
for col in adata.var.columns:
    if isinstance(adata.var[col].dtype, pd.CategoricalDtype):
        adata.var[col] = adata.var[col].astype(str)


In [None]:
X_is_raw(adata)

In [None]:
adata.obs.head()

In [None]:
adata.var.head()

In [None]:
# Write the merged Yoshida + ImYoo object to disk as an h5ad file.
adata.write('../data/merged_pbmc_yoshida-imyoo_ctl230808_raw.h5ad')