# Obtaining unified colormap for the cell types
this is tricky because we filter the cell types based on abundance in all datasets and create the mapping based on these

In [1]:
import pandas as pd
import numpy as np

# Cell Type proportions to filter out irrelevant cell types

In [2]:
df_counts = pd.read_csv('/home/degottardiraphael/MasterThesis/Benchmarking pipeline/CellTypeAssignment metrics/Celltypist/Celltypist_predicted_cell_types_counts.csv')
print(df_counts.head())

  Cell Type pred      SC    ENACT      B2C     Xseg
0           AT1   2966.0   2235.0   1521.0    734.0
1            AT2  7025.0  18365.0  18117.0  14836.0
2       B_memory  3076.0  16910.0  13919.0  11810.0
3        B_naive   319.0     93.0     47.0     38.0
4   B_plasma_IgA  1356.0   5188.0   3735.0   2781.0


In [3]:
# Create a cell type map by mapping each label to its prefix (before the first underscore)
cell_type_map = {
    label: label.split('_')[0] if '_' in label else label
    for label in df_counts['Cell Type pred'].unique()
}
print(cell_type_map)

{'AT1 ': 'AT1 ', 'AT2': 'AT2', 'B_memory': 'B', 'B_naive': 'B', 'B_plasma_IgA': 'B', 'B_plasma_IgG': 'B', 'B_plasmablast': 'B', 'Basal': 'Basal', 'CD4_EM/Effector': 'CD4', 'CD4_TRM': 'CD4', 'CD4_naive/CM': 'CD4', 'CD8_EM': 'CD8', 'CD8_EM/EMRA': 'CD8', 'CD8_TRM': 'CD8', 'CD8_TRM/EM': 'CD8', 'Ciliated': 'Ciliated', 'DC_1': 'DC', 'DC_2': 'DC', 'DC_activated': 'DC', 'DC_plasmacytoid': 'DC', 'Deuterosomal': 'Deuterosomal', 'Dividing_AT2': 'Dividing', 'Dividing_Basal': 'Dividing', 'Endothelia_Lymphatic': 'Endothelia', 'Endothelia_vascular_Cap_a': 'Endothelia', 'Endothelia_vascular_Cap_g': 'Endothelia', 'Endothelia_vascular_arterial_pulmonary': 'Endothelia', 'Endothelia_vascular_arterial_systemic': 'Endothelia', 'Endothelia_vascular_venous_pulmonary': 'Endothelia', 'Endothelia_vascular_venous_systemic': 'Endothelia', 'Erythrocyte': 'Erythrocyte', 'Fibro_adventitial': 'Fibro', 'Fibro_alveolar': 'Fibro', 'Fibro_immune_recruiting': 'Fibro', 'Fibro_myofibroblast': 'Fibro', 'Fibro_peribronchial': 

In [4]:
# Add a column for the broad/general cell type
df_counts['Broad_Cell_Type'] = df_counts['Cell Type pred'].map(cell_type_map).fillna(df_counts['Cell Type pred'])

# Aggregate the counts by the new broad/general cell types
aggregated_counts = df_counts.groupby('Broad_Cell_Type')[['SC', 'ENACT', 'B2C', 'Xseg']].sum()
aggregated_counts

Unnamed: 0_level_0,SC,ENACT,B2C,Xseg
Broad_Cell_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT1,2966.0,2235.0,1521.0,734.0
AT2,7025.0,18365.0,18117.0,14836.0
B,6228.0,31411.0,25380.0,21246.0
Basal,525.0,4032.0,3680.0,2798.0
CD4,19746.0,20135.0,19823.0,17808.0
CD8,10052.0,1380.0,852.0,517.0
Ciliated,2151.0,3906.0,2726.0,2097.0
DC,3524.0,162.0,89.0,46.0
Deuterosomal,10.0,0.0,0.0,0.0
Dividing,589.0,82.0,67.0,25.0


In [5]:
normalized_aggregated = aggregated_counts.div(aggregated_counts.sum())
# Remove broad cell types with less than 1% in all methods (filtering)
normalized_aggregated = normalized_aggregated[(normalized_aggregated >= 0.01).any(axis=1)]
normalized_aggregated

Unnamed: 0_level_0,SC,ENACT,B2C,Xseg
Broad_Cell_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT1,0.02452,0.011851,0.007846,0.003895
AT2,0.058077,0.097377,0.093456,0.078738
B,0.051488,0.166552,0.130922,0.112757
Basal,0.00434,0.021379,0.018983,0.01485
CD4,0.163243,0.106763,0.102256,0.094511
CD8,0.083101,0.007317,0.004395,0.002744
Ciliated,0.017783,0.020711,0.014062,0.011129
DC,0.029133,0.000859,0.000459,0.000244
Endothelia,0.023892,0.06927,0.060994,0.053003
Fibro,0.012103,0.052095,0.038977,0.027942


In [6]:
# Update the cell type map: only map to cell types present in normalized_aggregated, others to "other"
broad_types_set = set(normalized_aggregated.index)
cell_type_map_updated = {
    label: mapped if mapped in broad_types_set else "other"
    for label, mapped in cell_type_map.items()
}
print(cell_type_map_updated)

{'AT1 ': 'AT1 ', 'AT2': 'AT2', 'B_memory': 'B', 'B_naive': 'B', 'B_plasma_IgA': 'B', 'B_plasma_IgG': 'B', 'B_plasmablast': 'B', 'Basal': 'Basal', 'CD4_EM/Effector': 'CD4', 'CD4_TRM': 'CD4', 'CD4_naive/CM': 'CD4', 'CD8_EM': 'CD8', 'CD8_EM/EMRA': 'CD8', 'CD8_TRM': 'CD8', 'CD8_TRM/EM': 'CD8', 'Ciliated': 'Ciliated', 'DC_1': 'DC', 'DC_2': 'DC', 'DC_activated': 'DC', 'DC_plasmacytoid': 'DC', 'Deuterosomal': 'other', 'Dividing_AT2': 'other', 'Dividing_Basal': 'other', 'Endothelia_Lymphatic': 'Endothelia', 'Endothelia_vascular_Cap_a': 'Endothelia', 'Endothelia_vascular_Cap_g': 'Endothelia', 'Endothelia_vascular_arterial_pulmonary': 'Endothelia', 'Endothelia_vascular_arterial_systemic': 'Endothelia', 'Endothelia_vascular_venous_pulmonary': 'Endothelia', 'Endothelia_vascular_venous_systemic': 'Endothelia', 'Erythrocyte': 'other', 'Fibro_adventitial': 'Fibro', 'Fibro_alveolar': 'Fibro', 'Fibro_immune_recruiting': 'Fibro', 'Fibro_myofibroblast': 'Fibro', 'Fibro_peribronchial': 'Fibro', 'ILC': 'ot

In [7]:
import json

with open('cell_type_map_VpX.json', 'w') as f:
    json.dump(cell_type_map_updated, f, indent=2)