In [28]:
import pandas as pd
import anndata as ad

In [29]:
# Read the AnnData object from the .h5ad file on disk.
# The bare name on the last line makes the notebook display its summary.
adata = ad.read_h5ad(
    filename='.data/Marburg_cell_states_locked_ctl240504.raw.h5ad',
)
adata

AnnData object with n_obs × n_vars = 97573 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', '_scvi_batch', '_scvi_labels', 'C_scANVI', 'viral_counts', 'infected_status', 'seed_labels', 'batch-scANVI'
    var: 'mt', 'ribo'
    uns: 'cell_compartment_colors', 'cell_states_colors', 'disease_colors', 'group_colors', 'infection_colors'
    obsm: 'X_scANVI', 'X_umap'

In [30]:
# Lookup table: batch label -> experimental group.
# Written as group -> member batches and inverted; sorting by batch label
# keeps the keys in plain string order.
# NOTE(review): 'CSE_06_CRTL' and 'GER14_CRTL' are spelled 'CRTL', not 'CTRL';
# presumably this matches the labels stored in adata.obs['batch'] -- confirm.
batch_to_group_dict = dict(sorted(
    (batch_label, group_label)
    for group_label, member_batches in (
        ('copd_ctrl', (
            'CHE01_CTRL', 'CHE02_CTRL', 'CHE03_CTRL',
            'CSE03_CTRL', 'CSE_06_CRTL', 'ST07_CSE_CTRL',
        )),
        ('copd_iav', (
            'CHE01_IAV', 'CHE02_IAV', 'CHE03_IAV',
            'CSE03_IAV', 'CSE_06_IAV', 'ST09_CSE_IAV',
        )),
        ('healthy_ctrl', (
            'GER02_CTRL', 'GER12_CTRL', 'GER14_CRTL',
            'GnR08_CTRL', 'GnR_07_CTRL', 'ST08_GnR_CTRL',
        )),
        ('healthy_iav', (
            'GER02_IAV', 'GER12_IAV', 'GER14_IAV',
            'GnR08_IAV', 'GnR_07_IAV', 'ST10_GnR_IAV',
        )),
    )
    for batch_label in member_batches
))

In [39]:
# Lookup table: batch label -> donor ID.
# Each donor is listed with its (CTRL batch, IAV batch) pair and the table is
# then inverted; sorting by batch label keeps the keys in plain string order.
# NOTE(review): 'CSE_06_CRTL' and 'GER14_CRTL' are spelled 'CRTL', not 'CTRL';
# presumably this matches the labels stored in adata.obs['batch'] -- confirm.
batch_to_donor = dict(sorted(
    (batch_label, donor_id)
    for donor_id, batch_pair in (
        ('CHE-01', ('CHE01_CTRL', 'CHE01_IAV')),
        ('CHE-02', ('CHE02_CTRL', 'CHE02_IAV')),
        ('CHE-03', ('CHE03_CTRL', 'CHE03_IAV')),
        ('CSE-03', ('CSE03_CTRL', 'CSE03_IAV')),
        ('CSE-06', ('CSE_06_CRTL', 'CSE_06_IAV')),
        ('CSE-07', ('ST07_CSE_CTRL', 'ST09_CSE_IAV')),
        ('GER-02', ('GER02_CTRL', 'GER02_IAV')),
        ('GER-12', ('GER12_CTRL', 'GER12_IAV')),
        ('GER-14', ('GER14_CRTL', 'GER14_IAV')),
        ('GNR-06', ('ST08_GnR_CTRL', 'ST10_GnR_IAV')),
        ('GNR-07', ('GnR_07_CTRL', 'GnR_07_IAV')),
        ('GNR-08', ('GnR08_CTRL', 'GnR08_IAV')),
    )
    for batch_label in batch_pair
))

In [40]:
# Lookup table: experimental group -> infection label ('CTRL' or 'IAV').
# Keys and values are paired position by position.
group_to_infection_dict = dict(zip(
    ('healthy_ctrl', 'healthy_iav', 'copd_ctrl', 'copd_iav'),
    ('CTRL', 'IAV', 'CTRL', 'IAV'),
))

In [41]:
# Lookup table: experimental group -> disease label ('healthy' or 'COPD').
# Written as disease -> member groups and inverted.
group_to_disease_dict = {
    group_label: disease_label
    for disease_label, member_groups in (
        ('healthy', ('healthy_ctrl', 'healthy_iav')),
        ('COPD', ('copd_ctrl', 'copd_iav')),
    )
    for group_label in member_groups
}

In [42]:
# Derive each cell's experimental group from its batch label.
# Series.map() yields NaN for any value that is not a key of the dict, so a
# batch label missing from batch_to_group_dict would silently blank out the
# existing 'group' entries. Validate before touching adata.obs so a failure
# leaves the table unchanged.
group_per_cell = adata.obs['batch'].map(batch_to_group_dict)
group_missing = group_per_cell.isna().to_numpy()
if group_missing.any():
    unmapped_batches = sorted(adata.obs['batch'][group_missing].astype(str).unique())
    raise ValueError(
        f"batch_to_group_dict has no entry for batch label(s): {unmapped_batches}"
    )
adata.obs['group'] = group_per_cell
adata.obs

Unnamed: 0,sex,age,ethnicity,PaCO2,donor,infection,disease,SMK,illumina_stimunr,bd_rhapsody,...,cell_states,leiden,cell_compartment,_scvi_batch,_scvi_labels,C_scANVI,viral_counts,infected_status,seed_labels,batch-scANVI
493638-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,OMG+Ciliated,MultiC_0,Ciliated,9,18,OMG+Ciliated,23.0,Uninfected,OMG+Ciliated,reference
816750-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,APOD+Ciliated,MultiC_1,Ciliated,9,0,APOD+Ciliated,20.0,Uninfected,APOD+Ciliated,reference
678000-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,19.0,Uninfected,TCN1+Club,reference
247802-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,16.0,Uninfected,TCN1+Club,reference
177433-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,17.0,Uninfected,TCN1+Club,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5327279-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,35.0,Uninfected,Unknown,query
10502482-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,12.0,Uninfected,Unknown,query
13574786-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,DHRS9+Club,Mixed_1,Club,6,36,DHRS9+Club,129.0,Uninfected,Unknown,query
3396934-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,MHCII+Club,Mixed_0,Club,6,36,MHCII+Club,22.0,Uninfected,Unknown,query


In [43]:
# Derive each cell's infection label from its experimental group.
# Series.map() yields NaN for any value that is not a key of the dict (and for
# values that are already missing), which would silently blank out the
# existing 'infection' entries. Validate before touching adata.obs so a
# failure leaves the table unchanged.
infection_per_cell = adata.obs['group'].map(group_to_infection_dict)
infection_missing = infection_per_cell.isna().to_numpy()
if infection_missing.any():
    unmapped_groups = sorted(adata.obs['group'][infection_missing].astype(str).unique())
    raise ValueError(
        f"group_to_infection_dict has no entry for group label(s): {unmapped_groups}"
    )
adata.obs['infection'] = infection_per_cell
adata.obs

Unnamed: 0,sex,age,ethnicity,PaCO2,donor,infection,disease,SMK,illumina_stimunr,bd_rhapsody,...,cell_states,leiden,cell_compartment,_scvi_batch,_scvi_labels,C_scANVI,viral_counts,infected_status,seed_labels,batch-scANVI
493638-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,OMG+Ciliated,MultiC_0,Ciliated,9,18,OMG+Ciliated,23.0,Uninfected,OMG+Ciliated,reference
816750-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,APOD+Ciliated,MultiC_1,Ciliated,9,0,APOD+Ciliated,20.0,Uninfected,APOD+Ciliated,reference
678000-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,19.0,Uninfected,TCN1+Club,reference
247802-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,16.0,Uninfected,TCN1+Club,reference
177433-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,17.0,Uninfected,TCN1+Club,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5327279-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,35.0,Uninfected,Unknown,query
10502482-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,12.0,Uninfected,Unknown,query
13574786-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,DHRS9+Club,Mixed_1,Club,6,36,DHRS9+Club,129.0,Uninfected,Unknown,query
3396934-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,MHCII+Club,Mixed_0,Club,6,36,MHCII+Club,22.0,Uninfected,Unknown,query


In [44]:
# Derive each cell's disease label from its experimental group.
# Series.map() yields NaN for any value that is not a key of the dict (and for
# values that are already missing), which would silently blank out the
# existing 'disease' entries. Validate before touching adata.obs so a failure
# leaves the table unchanged.
disease_per_cell = adata.obs['group'].map(group_to_disease_dict)
disease_missing = disease_per_cell.isna().to_numpy()
if disease_missing.any():
    unmapped_groups = sorted(adata.obs['group'][disease_missing].astype(str).unique())
    raise ValueError(
        f"group_to_disease_dict has no entry for group label(s): {unmapped_groups}"
    )
adata.obs['disease'] = disease_per_cell
adata.obs

Unnamed: 0,sex,age,ethnicity,PaCO2,donor,infection,disease,SMK,illumina_stimunr,bd_rhapsody,...,cell_states,leiden,cell_compartment,_scvi_batch,_scvi_labels,C_scANVI,viral_counts,infected_status,seed_labels,batch-scANVI
493638-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,OMG+Ciliated,MultiC_0,Ciliated,9,18,OMG+Ciliated,23.0,Uninfected,OMG+Ciliated,reference
816750-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,APOD+Ciliated,MultiC_1,Ciliated,9,0,APOD+Ciliated,20.0,Uninfected,APOD+Ciliated,reference
678000-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,19.0,Uninfected,TCN1+Club,reference
247802-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,16.0,Uninfected,TCN1+Club,reference
177433-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,GNR-06,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,17.0,Uninfected,TCN1+Club,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5327279-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,35.0,Uninfected,Unknown,query
10502482-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,12.0,Uninfected,Unknown,query
13574786-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,DHRS9+Club,Mixed_1,Club,6,36,DHRS9+Club,129.0,Uninfected,Unknown,query
3396934-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,MHCII+Club,Mixed_0,Club,6,36,MHCII+Club,22.0,Uninfected,Unknown,query


In [47]:
# Re-derive each cell's donor ID from its batch label.
# Series.map() yields NaN for any value that is not a key of the dict, so a
# batch label missing from batch_to_donor would silently blank out the
# existing 'donor' entries. Validate before touching adata.obs so a failure
# leaves the table unchanged.
donor_per_cell = adata.obs['batch'].map(batch_to_donor)
donor_missing = donor_per_cell.isna().to_numpy()
if donor_missing.any():
    unmapped_batches = sorted(adata.obs['batch'][donor_missing].astype(str).unique())
    raise ValueError(
        f"batch_to_donor has no entry for batch label(s): {unmapped_batches}"
    )
adata.obs['donor'] = donor_per_cell
adata.obs

Unnamed: 0,sex,age,ethnicity,PaCO2,donor,infection,disease,SMK,illumina_stimunr,bd_rhapsody,...,cell_states,leiden,cell_compartment,_scvi_batch,_scvi_labels,C_scANVI,viral_counts,infected_status,seed_labels,batch-scANVI
493638-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,CSE-07,CTRL,COPD,7,21_0017,1,...,OMG+Ciliated,MultiC_0,Ciliated,9,18,OMG+Ciliated,23.0,Uninfected,OMG+Ciliated,reference
816750-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,CSE-07,CTRL,COPD,7,21_0017,1,...,APOD+Ciliated,MultiC_1,Ciliated,9,0,APOD+Ciliated,20.0,Uninfected,APOD+Ciliated,reference
678000-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,CSE-07,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,19.0,Uninfected,TCN1+Club,reference
247802-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,CSE-07,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,16.0,Uninfected,TCN1+Club,reference
177433-ST07_CSE_CTRL-V1,male,67,caucasian,88.8,CSE-07,CTRL,COPD,7,21_0017,1,...,TCN1+Club,MultiC_3,Club,9,26,TCN1+Club,17.0,Uninfected,TCN1+Club,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5327279-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,35.0,Uninfected,Unknown,query
10502482-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,SERPINE1+Basal,Mixed_3,Basal,6,36,SERPINE1+Basal,12.0,Uninfected,Unknown,query
13574786-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,DHRS9+Club,Mixed_1,Club,6,36,DHRS9+Club,129.0,Uninfected,Unknown,query
3396934-GER02_IAV-V6,male,53,caucasian,71.1,GER-02,IAV,healthy,6,21_0507,6,...,MHCII+Club,Mixed_0,Club,6,36,MHCII+Club,22.0,Uninfected,Unknown,query


In [48]:
# Show the derived group and donor labels next to the batch label they were
# mapped from, so the mapping can be checked by eye.
adata.obs.loc[:, ['group', 'batch', 'donor']]

Unnamed: 0,group,batch,donor
493638-ST07_CSE_CTRL-V1,copd_ctrl,ST07_CSE_CTRL,CSE-07
816750-ST07_CSE_CTRL-V1,copd_ctrl,ST07_CSE_CTRL,CSE-07
678000-ST07_CSE_CTRL-V1,copd_ctrl,ST07_CSE_CTRL,CSE-07
247802-ST07_CSE_CTRL-V1,copd_ctrl,ST07_CSE_CTRL,CSE-07
177433-ST07_CSE_CTRL-V1,copd_ctrl,ST07_CSE_CTRL,CSE-07
...,...,...,...
5327279-GER02_IAV-V6,healthy_iav,GER02_IAV,GER-02
10502482-GER02_IAV-V6,healthy_iav,GER02_IAV,GER-02
13574786-GER02_IAV-V6,healthy_iav,GER02_IAV,GER-02
3396934-GER02_IAV-V6,healthy_iav,GER02_IAV,GER-02


In [49]:
# Save the object, including the updated obs columns, as .h5ad.
# NOTE(review): this is the same path the object was read from at the top of
# the notebook, so the input file is overwritten in place -- confirm intended.
adata.write_h5ad(
    filename='.data/Marburg_cell_states_locked_ctl240504.raw.h5ad',
)