### Notebook to reformat the healthy PBMCs from [Yoshida 2022](https://www.nature.com/articles/s41586-021-04345-x) and ImYoo private data prior label transfer experiments.

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- v230808

### Import required modules

In [1]:
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

### Set up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.8.0
scanpy      1.9.2
-----
PIL                 9.4.0
appnope             0.1.3
asttokens           NA
backcall            0.2.0
cffi                1.15.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
igraph              0.10.4
importlib_resources NA
ipykernel           6.21.2
ipywidgets          8.0.7
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.9.1
llvmlite            0.39.1
louvain             0.8.0
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
numba               0.56.4
numexpr             2.8.4
numpy               1.23.5
packaging           23.0
pandas              1.5.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.0.

In [3]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis = 0))

### Read in `Yoshida M, 2022` Healthy data

In [4]:
yoshida_pbmc = sc.read_h5ad('../data/meyer_nikolic_covid_pbmc_raw.h5ad')
yoshida_pbmc

AnnData object with n_obs × n_vars = 422220 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'

In [5]:
X_is_raw(yoshida_pbmc)

True

In [6]:
yoshida_pbmc.obs['COVID_status'].cat.categories

Index(['COVID-19', 'Healthy', 'Post-COVID-19'], dtype='object')

In [7]:
yoshida_pbmc.obs['annotation_detailed'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

### Make covariates uniform

In [8]:
pbmc_healthy = yoshida_pbmc[yoshida_pbmc.obs['COVID_status'].isin(['Healthy'])]
pbmc_healthy.obs['seed_labels'] = pbmc_healthy.obs['annotation_detailed'].copy()
pbmc_healthy.obs['donor'] = pbmc_healthy.obs['patient_id'].copy()
pbmc_healthy.obs['sample'] = pbmc_healthy.obs['sample_id'].copy()
pbmc_healthy

  pbmc_healthy.obs['seed_labels'] = pbmc_healthy.obs['annotation_detailed'].copy()


AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'seed_labels', 'donor', 'sample'
    var: 'name'

In [9]:
pbmc_healthy.obs['seed_labels'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

In [10]:
pbmc_healthy.obs['cell_states'] = pbmc_healthy.obs['seed_labels'].copy()
pbmc_healthy.obs['cell_states'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

### Clean unused `adata.obs`

In [11]:
pbmc_healthy

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'seed_labels', 'donor', 'sample', 'cell_states'
    var: 'name'

In [12]:
pbmc_healthy.obs = pbmc_healthy.obs[['Sex', 'annotation_broad', 'annotation_detailed', 'sample', 'seed_labels', 'donor', 'cell_states']]
pbmc_healthy

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'Sex', 'annotation_broad', 'annotation_detailed', 'sample', 'seed_labels', 'donor', 'cell_states'
    var: 'name'

In [13]:
pbmc_healthy.obs['sample'].cat.categories

Index(['AN1', 'AN2', 'AN3', 'AN5', 'AN6', 'AN7', 'AN9', 'AN11', 'AN12', 'AN13',
       'AN14', 'NP13', 'NP15', 'NP16', 'NP17', 'NP18', 'NP19', 'NP20', 'NP21',
       'NP22', 'NP23', 'NP24', 'NP26', 'NP27', 'NP28', 'NP30', 'NP31', 'NP32',
       'NP35', 'NP36', 'NP37', 'NP38', 'NP39', 'NP41', 'NP44'],
      dtype='object')

### Read and process `ImYoo` private PBMC data

In [14]:
imyoo_pbmc = sc.read_h5ad('../data/imyoo_capillary_blood_samples_76535_pbmcs.h5ad')
imyoo_pbmc

AnnData object with n_obs × n_vars = 76535 × 36601
    obs: 'barcode', 'Sample IDs', 'Participant IDs', 'Cell Barcoding Runs', 'Lane', 'extraction_protocol', 'sample_processing_delay_seconds', 'cell_barcoding_delay_days', 'cell_barcoding_protocol', 'run_lane_batch', 'cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'c1', 'c2', 'c3', 'c4', 'original_sample_id'
    var: 'name', 'id'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'tsne', 'umap'
    obsm: 'X_mde', 'X_scvi', 'X_tsne', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [15]:
imyoo_pbmc.var_names = imyoo_pbmc.var['name'].astype(str)
imyoo_pbmc.var_names = pd.Index(imyoo_pbmc.var_names)
imyoo_pbmc.var_names_make_unique()
imyoo_pbmc.var_names = imyoo_pbmc.var_names.astype(str)
imyoo_pbmc.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.5', 'AL627309.4', 'AP006222.2', 'AL732372.1',
       ...
       'AC133551.1', 'AC136612.1', 'AC136616.1', 'AC136616.3', 'AC136616.2',
       'AC141272.1', 'AC023491.2', 'AC007325.1', 'AC007325.4', 'AC007325.2'],
      dtype='object', name='name', length=36601)

In [16]:
X_is_raw(imyoo_pbmc)

True

In [17]:
imyoo_pbmc.obs['cell_type_level_3'].cat.categories

Index(['pDC', 'CD4 T Cells', 'Gamma-Delta T Cells', 'cDC2',
       'Intermediate Monocytes', 'cDC3', 'CD56 Bright NK Cells', 'tumorDC',
       'CLL-associated B Cells', 'Classical Memory B Cells',
       'Mucosal-Associated Invariant T Cells', 'Plasma B Cells',
       'CD56 Dim NK Cells', 'Classical Monocytes', 'CD8 T Cells',
       'Nonclassical Monocytes', 'Naive B Cells', 'asDC', 'Mast Cells',
       'IgM Memory B Cells', 'Classical Monocytes - HSP artifact',
       'Age-associated B Cells', 'Adaptive NK Cells'],
      dtype='object')

In [18]:
imyoo_pbmc.obs['seed_labels'] = imyoo_pbmc.obs['cell_type_level_3'].copy()
imyoo_pbmc.obs['donor'] = imyoo_pbmc.obs['Participant IDs'].copy()
imyoo_pbmc.obs['sample'] = imyoo_pbmc.obs['Sample IDs'].copy()
imyoo_pbmc

AnnData object with n_obs × n_vars = 76535 × 36601
    obs: 'barcode', 'Sample IDs', 'Participant IDs', 'Cell Barcoding Runs', 'Lane', 'extraction_protocol', 'sample_processing_delay_seconds', 'cell_barcoding_delay_days', 'cell_barcoding_protocol', 'run_lane_batch', 'cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'c1', 'c2', 'c3', 'c4', 'original_sample_id', 'seed_labels', 'donor', 'sample'
    var: 'name', 'id'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'tsne', 'umap'
    obsm: 'X_mde', 'X_scvi', 'X_tsne', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [19]:
imyoo_pbmc.obs['cell_states'] = imyoo_pbmc.obs['seed_labels'].copy()
imyoo_pbmc.obs['cell_states'].cat.categories

Index(['pDC', 'CD4 T Cells', 'Gamma-Delta T Cells', 'cDC2',
       'Intermediate Monocytes', 'cDC3', 'CD56 Bright NK Cells', 'tumorDC',
       'CLL-associated B Cells', 'Classical Memory B Cells',
       'Mucosal-Associated Invariant T Cells', 'Plasma B Cells',
       'CD56 Dim NK Cells', 'Classical Monocytes', 'CD8 T Cells',
       'Nonclassical Monocytes', 'Naive B Cells', 'asDC', 'Mast Cells',
       'IgM Memory B Cells', 'Classical Monocytes - HSP artifact',
       'Age-associated B Cells', 'Adaptive NK Cells'],
      dtype='object')

In [20]:
imyoo_pbmc

AnnData object with n_obs × n_vars = 76535 × 36601
    obs: 'barcode', 'Sample IDs', 'Participant IDs', 'Cell Barcoding Runs', 'Lane', 'extraction_protocol', 'sample_processing_delay_seconds', 'cell_barcoding_delay_days', 'cell_barcoding_protocol', 'run_lane_batch', 'cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'c1', 'c2', 'c3', 'c4', 'original_sample_id', 'seed_labels', 'donor', 'sample', 'cell_states'
    var: 'name', 'id'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'tsne', 'umap'
    obsm: 'X_mde', 'X_scvi', 'X_tsne', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [21]:
imyoo_pbmc.obs = imyoo_pbmc.obs[['cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'seed_labels', 'donor', 'sample', 'cell_states']]
imyoo_pbmc

AnnData object with n_obs × n_vars = 76535 × 36601
    obs: 'cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'seed_labels', 'donor', 'sample', 'cell_states'
    var: 'name', 'id'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'tsne', 'umap'
    obsm: 'X_mde', 'X_scvi', 'X_tsne', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

### Make annotations uniform

In [22]:
imyoo_pbmc.obs['cell_states'].cat.categories

Index(['pDC', 'CD4 T Cells', 'Gamma-Delta T Cells', 'cDC2',
       'Intermediate Monocytes', 'cDC3', 'CD56 Bright NK Cells', 'tumorDC',
       'CLL-associated B Cells', 'Classical Memory B Cells',
       'Mucosal-Associated Invariant T Cells', 'Plasma B Cells',
       'CD56 Dim NK Cells', 'Classical Monocytes', 'CD8 T Cells',
       'Nonclassical Monocytes', 'Naive B Cells', 'asDC', 'Mast Cells',
       'IgM Memory B Cells', 'Classical Monocytes - HSP artifact',
       'Age-associated B Cells', 'Adaptive NK Cells'],
      dtype='object')

In [23]:
imyoo_pbmc.obs['cell_states'] = imyoo_pbmc.obs['cell_states'].cat.rename_categories(['pDC', 'CD4+T', 'gdT', 'cDC2',
       'CD14+CD16+Monocytes', 'cDC3', 'CD56+NK', 'tumorDC',
       'CLL-associated_B', 'class_memB',
       'muco_invarT', 'plasma_B',
       'CD56_dimNK', 'CD14+Monocytes', 'CD8+T',
       'CD16+Monocytes', 'naive_B', 'asDC', 'Mast',
       'IgM_memB', 'CD14+Monocytes-HSP_artifact',
       'Ageing_B', 'adaptive_NK'])
imyoo_pbmc.obs['cell_states'].cat.categories

Index(['pDC', 'CD4+T', 'gdT', 'cDC2', 'CD14+CD16+Monocytes', 'cDC3', 'CD56+NK',
       'tumorDC', 'CLL-associated_B', 'class_memB', 'muco_invarT', 'plasma_B',
       'CD56_dimNK', 'CD14+Monocytes', 'CD8+T', 'CD16+Monocytes', 'naive_B',
       'asDC', 'Mast', 'IgM_memB', 'CD14+Monocytes-HSP_artifact', 'Ageing_B',
       'adaptive_NK'],
      dtype='object')

In [24]:
pbmc_healthy.obs['cell_states'].cat.categories

Index(['HPC IFN stim', 'B n-sw mem IFN stim', 'B naive IFN stim',
       'Monocyte CD16 IFN stim', 'Monocyte CD14 IFN stim', 'NK IFN stim',
       'T CD8 CTL IFN stim', 'T CD4 naive IFN stim', 'RBC', 'Platelets',
       'Cycling', 'Baso/Eos', 'HPC', 'Plasmablasts', 'Plasma cells', 'B invar',
       'B sw mem', 'B n-sw mem', 'B naive', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'Monocyte CD16+C1', 'Monocyte CD16', 'Monocyte CD14 IL6',
       'Monocyte CD14', 'ILC', 'NK CD56', 'NK', 'NKT', 'MAIT', 'T reg',
       'T g/d', 'T CD8 CTL', 'T CD8 EMRA', 'T CD8 EM', 'T CD8 CM',
       'T CD8 naive', 'T CD4 CTL', 'T CD4 helper', 'T CD4 naive'],
      dtype='object')

In [25]:
pbmc_healthy.obs['cell_states'] = pbmc_healthy.obs['cell_states'].cat.rename_categories(['IFN-stim_HPC', 'IFN-stim_n-sw_memB', 'IFN-stim_naive_B',
       'IFN-stim_CD16+Monocyte', 'IFN-stim_CD14+Monocyte', 'IFN-stim_NK',
       'IFN-stim_ctlCD8+T', 'IFN-stim_naiveCD4+T', 'RBC', 'platelets',
       'cycling', 'Baso/Eos', 'HPC', 'plasmablasts', 'plasma_B', 'invarB',
       'sw_memB', 'n-sw_memB', 'naive_B', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'CD16+C1+Monocyte', 'CD16+Monocyte', ' CD14+IL6+Monocyte',
       'CD14+Monocyte', 'ILC', 'CD56+NK', 'NK', 'NKT', 'MAIT', 'regT',
       'gdT', 'ctlCD8+T', 'emraCD8+T', 'emCD8+T', 'cmCD8+T',
       'naiveCD8+T', 'ctlCD4+T  ', ' hCD4+T ', 'naiveCD4+T'])
pbmc_healthy.obs['cell_states'].cat.categories

Index(['IFN-stim_HPC', 'IFN-stim_n-sw_memB', 'IFN-stim_naive_B',
       'IFN-stim_CD16+Monocyte', 'IFN-stim_CD14+Monocyte', 'IFN-stim_NK',
       'IFN-stim_ctlCD8+T', 'IFN-stim_naiveCD4+T', 'RBC', 'platelets',
       'cycling', 'Baso/Eos', 'HPC', 'plasmablasts', 'plasma_B', 'invarB',
       'sw_memB', 'n-sw_memB', 'naive_B', 'cDC2', 'cDC1', 'AS-DC', 'pDC',
       'CD16+C1+Monocyte', 'CD16+Monocyte', ' CD14+IL6+Monocyte',
       'CD14+Monocyte', 'ILC', 'CD56+NK', 'NK', 'NKT', 'MAIT', 'regT', 'gdT',
       'ctlCD8+T', 'emraCD8+T', 'emCD8+T', 'cmCD8+T', 'naiveCD8+T',
       'ctlCD4+T  ', ' hCD4+T ', 'naiveCD4+T'],
      dtype='object')

### Merge objects

In [26]:
adata = pbmc_healthy.concatenate(imyoo_pbmc, batch_key = 'generator', batch_categories = ['YoshidaM_2022', 'ImYoo_2023'], join = 'inner')
adata

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 250219 × 31908
    obs: 'Sex', 'annotation_broad', 'annotation_detailed', 'sample', 'seed_labels', 'donor', 'cell_states', 'cell_type_level_1', 'cell_type_level_2', 'cell_type_level_3', 'cell_type_level_4', 'generator'
    var: 'name-ImYoo_2023', 'id-ImYoo_2023', 'name-YoshidaM_2022'

In [28]:
del(adata.obs['Sex'])
del(adata.obs['annotation_detailed'])
del(adata.obs['annotation_broad'])
del(adata.obs['cell_type_level_1'])
del(adata.obs['cell_type_level_2'])
del(adata.obs['cell_type_level_3'])
del(adata.obs['cell_type_level_4'])

In [29]:
adata

AnnData object with n_obs × n_vars = 250219 × 31908
    obs: 'sample', 'seed_labels', 'donor', 'cell_states', 'generator'
    var: 'name-ImYoo_2023', 'id-ImYoo_2023', 'name-YoshidaM_2022'

In [31]:
adata.obs['cell_states'] = adata.obs['cell_states'].astype('category')
adata.obs['cell_states'].cat.categories

Index([' CD14+IL6+Monocyte', ' hCD4+T ', 'AS-DC', 'Ageing_B', 'Baso/Eos',
       'CD14+CD16+Monocytes', 'CD14+Monocyte', 'CD14+Monocytes',
       'CD14+Monocytes-HSP_artifact', 'CD16+C1+Monocyte', 'CD16+Monocyte',
       'CD16+Monocytes', 'CD4+T', 'CD56+NK', 'CD56_dimNK', 'CD8+T',
       'CLL-associated_B', 'HPC', 'IFN-stim_CD14+Monocyte',
       'IFN-stim_CD16+Monocyte', 'IFN-stim_HPC', 'IFN-stim_NK',
       'IFN-stim_ctlCD8+T', 'IFN-stim_n-sw_memB', 'IFN-stim_naiveCD4+T',
       'IFN-stim_naive_B', 'ILC', 'IgM_memB', 'MAIT', 'Mast', 'NK', 'NKT',
       'RBC', 'adaptive_NK', 'asDC', 'cDC1', 'cDC2', 'cDC3', 'class_memB',
       'cmCD8+T', 'ctlCD4+T  ', 'ctlCD8+T', 'cycling', 'emCD8+T', 'emraCD8+T',
       'gdT', 'invarB', 'muco_invarT', 'n-sw_memB', 'naiveCD4+T', 'naiveCD8+T',
       'naive_B', 'pDC', 'plasma_B', 'plasmablasts', 'platelets', 'regT',
       'sw_memB', 'tumorDC'],
      dtype='object')

In [33]:
adata.obs['seed_labels'] = adata.obs['cell_states'].copy()
adata.obs['seed_labels'] = adata.obs['seed_labels'].astype('category')
adata.obs['seed_labels'].cat.categories

Index([' CD14+IL6+Monocyte', ' hCD4+T ', 'AS-DC', 'Ageing_B', 'Baso/Eos',
       'CD14+CD16+Monocytes', 'CD14+Monocyte', 'CD14+Monocytes',
       'CD14+Monocytes-HSP_artifact', 'CD16+C1+Monocyte', 'CD16+Monocyte',
       'CD16+Monocytes', 'CD4+T', 'CD56+NK', 'CD56_dimNK', 'CD8+T',
       'CLL-associated_B', 'HPC', 'IFN-stim_CD14+Monocyte',
       'IFN-stim_CD16+Monocyte', 'IFN-stim_HPC', 'IFN-stim_NK',
       'IFN-stim_ctlCD8+T', 'IFN-stim_n-sw_memB', 'IFN-stim_naiveCD4+T',
       'IFN-stim_naive_B', 'ILC', 'IgM_memB', 'MAIT', 'Mast', 'NK', 'NKT',
       'RBC', 'adaptive_NK', 'asDC', 'cDC1', 'cDC2', 'cDC3', 'class_memB',
       'cmCD8+T', 'ctlCD4+T  ', 'ctlCD8+T', 'cycling', 'emCD8+T', 'emraCD8+T',
       'gdT', 'invarB', 'muco_invarT', 'n-sw_memB', 'naiveCD4+T', 'naiveCD8+T',
       'naive_B', 'pDC', 'plasma_B', 'plasmablasts', 'platelets', 'regT',
       'sw_memB', 'tumorDC'],
      dtype='object')

In [34]:
adata.obs['sample'] = adata.obs['sample'].astype(str).astype('category')
adata.obs['sample'].cat.categories

Index(['1004', '1005', '1071', '1072', '1170', '1171', '1176', '1177', '1382',
       '1385', '1394', '1395', '20', '329', '424', '892', '894', '909', '911',
       '95', '952', '953', '958', '959', '970', '971', '977', '978', 'AN1',
       'AN11', 'AN12', 'AN13', 'AN14', 'AN2', 'AN3', 'AN5', 'AN6', 'AN7',
       'AN9', 'NP13', 'NP15', 'NP16', 'NP17', 'NP18', 'NP19', 'NP20', 'NP21',
       'NP22', 'NP23', 'NP24', 'NP26', 'NP27', 'NP28', 'NP30', 'NP31', 'NP32',
       'NP35', 'NP36', 'NP37', 'NP38', 'NP39', 'NP41', 'NP44'],
      dtype='object')

In [35]:
adata.obs['generator'] = adata.obs['generator'].astype('category')
adata.obs['generator'].cat.categories

Index(['YoshidaM_2022', 'ImYoo_2023'], dtype='object')

In [36]:
adata.obs['donor'] = adata.obs['donor'].astype('category')
adata.obs['donor'].cat.categories

Index([     2,      3,     51,  'AN1', 'AN11', 'AN12', 'AN13', 'AN14',  'AN2',
        'AN3',  'AN5',  'AN6',  'AN7',  'AN9', 'NP13', 'NP15', 'NP16', 'NP17',
       'NP18', 'NP19', 'NP20', 'NP21', 'NP22', 'NP23', 'NP24', 'NP26', 'NP27',
       'NP28', 'NP30', 'NP31', 'NP32', 'NP35', 'NP36', 'NP37', 'NP38', 'NP39',
       'NP41', 'NP44'],
      dtype='object')

In [37]:
adata.obs['cell_states'].cat.categories

Index([' CD14+IL6+Monocyte', ' hCD4+T ', 'AS-DC', 'Ageing_B', 'Baso/Eos',
       'CD14+CD16+Monocytes', 'CD14+Monocyte', 'CD14+Monocytes',
       'CD14+Monocytes-HSP_artifact', 'CD16+C1+Monocyte', 'CD16+Monocyte',
       'CD16+Monocytes', 'CD4+T', 'CD56+NK', 'CD56_dimNK', 'CD8+T',
       'CLL-associated_B', 'HPC', 'IFN-stim_CD14+Monocyte',
       'IFN-stim_CD16+Monocyte', 'IFN-stim_HPC', 'IFN-stim_NK',
       'IFN-stim_ctlCD8+T', 'IFN-stim_n-sw_memB', 'IFN-stim_naiveCD4+T',
       'IFN-stim_naive_B', 'ILC', 'IgM_memB', 'MAIT', 'Mast', 'NK', 'NKT',
       'RBC', 'adaptive_NK', 'asDC', 'cDC1', 'cDC2', 'cDC3', 'class_memB',
       'cmCD8+T', 'ctlCD4+T  ', 'ctlCD8+T', 'cycling', 'emCD8+T', 'emraCD8+T',
       'gdT', 'invarB', 'muco_invarT', 'n-sw_memB', 'naiveCD4+T', 'naiveCD8+T',
       'naive_B', 'pDC', 'plasma_B', 'plasmablasts', 'platelets', 'regT',
       'sw_memB', 'tumorDC'],
      dtype='object')

In [38]:
for col in adata.obs.columns:
    if pd.api.types.is_categorical_dtype(adata.obs[col]):
        adata.obs[col] = adata.obs[col].astype(str)

In [39]:
for col in adata.var.columns:
    if pd.api.types.is_categorical_dtype(adata.var[col]):
        adata.var[col] = adata.var[col].astype(str)


In [40]:
X_is_raw(adata)

True

In [41]:
adata.obs.head()

Unnamed: 0,sample,seed_labels,donor,cell_states,generator
CV001_KM10202384-CV001_KM10202394_AAACCTGAGGCAGGTT-1-YoshidaM_2022,AN5,CD14+Monocyte,AN5,CD14+Monocyte,YoshidaM_2022
CV001_KM10202384-CV001_KM10202394_AAACCTGAGTGTCCCG-1-YoshidaM_2022,AN5,hCD4+T,AN5,hCD4+T,YoshidaM_2022
CV001_KM10202384-CV001_KM10202394_AAACCTGCAGATGGGT-1-YoshidaM_2022,AN3,hCD4+T,AN3,hCD4+T,YoshidaM_2022
CV001_KM10202384-CV001_KM10202394_AAACCTGGTATAGTAG-1-YoshidaM_2022,AN5,naiveCD8+T,AN5,naiveCD8+T,YoshidaM_2022
CV001_KM10202384-CV001_KM10202394_AAACCTGGTGTGCGTC-1-YoshidaM_2022,AN5,naiveCD4+T,AN5,naiveCD4+T,YoshidaM_2022


In [42]:
adata.var.head()

Unnamed: 0,name-ImYoo_2023,id-ImYoo_2023,name-YoshidaM_2022
MIR1302-2HG,MIR1302-2HG,ENSG00000243485,MIR1302-2HG
FAM138A,FAM138A,ENSG00000237613,FAM138A
OR4F5,OR4F5,ENSG00000186092,OR4F5
AL627309.1,AL627309.1,ENSG00000238009,AL627309.1
AL627309.3,AL627309.3,ENSG00000239945,AL627309.3


In [43]:
adata.write('../data/merged_pbmc_yoshida-imyoo_ctl230808_raw.h5ad')