In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


#plt.style.use(['science','nature','no-latex'])
dpi_fig_save = 300

overwriteFigures = True
overwriteData = True

**Loading annotation pkl**

In [2]:
step = "Step5"

In [3]:
annotation_col_name = 'annotation_step_5'

In [4]:
file_paths = glob.glob(str(here(f"02_fromCellLineages_to_CellTypes/{step}/*/results/INFLAMMATION_cluster_res*_annotated.pkl")))
# Load and concatenate all files into a single column DataFrame

In [5]:
for file_path in file_paths: 
    print(file_path)
    clusterAnnotDF = pd.read_pickle(file_path)
    print(set(clusterAnnotDF[annotation_col_name]))

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_fromCellLineages_to_CellTypes/Step5/UTC/results/INFLAMMATION_cluster_res0.5_annotated.pkl
{'MAIT_17', 'LowQuality_cells', 'gdT_V1', 'gdT_V2_Vγ9', 'MAIT'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_fromCellLineages_to_CellTypes/Step5/T_CD4_NonNaive/results/INFLAMMATION_cluster_res0.75_annotated.pkl
{'Th1', 'T_CD4_EM', 'Tregs', 'LowQuality_cells', 'T_CD4_eff', 'T_CD4_CM', 'T_CD4_CM_ribo', 'Tregs_activated', 'T_CD4_EMRA', 'Th2', 'Th0'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_fromCellLineages_to_CellTypes/Step5/T_CD8_NonNaive/results/INFLAMMATION_cluster_res1_annotated.pkl
{'T_CD8_activated', 'T_CD8_CM_stem', 'T_CD8_EM_CX3CR1high', 'LowQuality_cells', 'T_CD8_arrested', 'T_CD8_Mem_cytotoxic', 'T_CD8_EM_CX3CR1int', 'T_CD8_IFNresponse', 'T_CD8_eff_HOBIT', 'T_CD8_CM'}


In [6]:
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

In [7]:
print(clusterAnnotDF.head())

                                       annotation_step_5
cellID                                                  
SCGT00_L051_I56.3P_T0_ATGGGTTCAGGAAGTC        gdT_V2_Vγ9
SCGT00_L051_I52.3P_T0_GCCATGGCATCGATGT        gdT_V2_Vγ9
SCGT00_L051_I52.3P_T0_GGGTATTTCGCGTCGA        gdT_V2_Vγ9
SCGT00_L035_I52.3P_T0_AGAAGCGTCCACAAGT        gdT_V2_Vγ9
SCGT00_L035_I52.3P_T0_ATAGACCCATTCTCCG  LowQuality_cells


In [8]:
set(clusterAnnotDF[annotation_col_name])

{'LowQuality_cells',
 'MAIT',
 'MAIT_17',
 'T_CD4_CM',
 'T_CD4_CM_ribo',
 'T_CD4_EM',
 'T_CD4_EMRA',
 'T_CD4_eff',
 'T_CD8_CM',
 'T_CD8_CM_stem',
 'T_CD8_EM_CX3CR1high',
 'T_CD8_EM_CX3CR1int',
 'T_CD8_IFNresponse',
 'T_CD8_Mem_cytotoxic',
 'T_CD8_activated',
 'T_CD8_arrested',
 'T_CD8_eff_HOBIT',
 'Th0',
 'Th1',
 'Th2',
 'Tregs',
 'Tregs_activated',
 'gdT_V1',
 'gdT_V2_Vγ9'}

In [9]:
print(clusterAnnotDF.shape)

(1549710, 1)


In [10]:
clusterAnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))

**Updating master annotation DF**

In [11]:
master_AnnotDFpath = str(here("02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))
master_AnnotDF = pd.read_pickle(master_AnnotDFpath)

In [12]:
last_column_name = master_AnnotDF.columns[-1]
master_AnnotDF[annotation_col_name] = master_AnnotDF[last_column_name]
master_AnnotDF[annotation_col_name].update(clusterAnnotDF[annotation_col_name])

In [13]:
print(master_AnnotDF.head())

                                       annotation_step_1 annotation_step_2  \
cellID                                                                       
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC   
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC   
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC   
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC               pDC   
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC               pDC   

                                       annotation_step_3 annotation_step_4  \
cellID                                                                       
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC   
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC   
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC   
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC       

In [14]:
print(master_AnnotDF.shape)

(4918140, 5)


In [15]:
set(master_AnnotDF[annotation_col_name])

{'B_IFNresponder',
 'B_Memory_ITGAX',
 'B_Memory_switched',
 'B_Memory_unswitched',
 'B_Naive',
 'B_Naive_activated',
 'B_Progenitors',
 'B_Transitional',
 'DC4',
 'DC5',
 'DC_CCR7',
 'DC_Proliferative',
 'Doublets',
 'HSC_LMP',
 'HSC_MEMP',
 'HSC_MMP',
 'LowQuality_cells',
 'MAIT',
 'MAIT_17',
 'Mono_IFNresponse',
 'Mono_classical',
 'Mono_inflammatory',
 'Mono_nonClassical',
 'Mono_regulatory',
 'NK_CD16high',
 'NK_CD56dimCD16',
 'NK_CD56high',
 'NK_IFN1response',
 'NK_Proliferative',
 'NK_adaptive',
 'NK_lowRibocontent',
 'Plasma_IGHA',
 'Plasma_IGHG',
 'Plasma_Proliferative',
 'Plasma_XBP1',
 'Platelets',
 'RBC',
 'T_CD4_CM',
 'T_CD4_CM_ribo',
 'T_CD4_EM',
 'T_CD4_EMRA',
 'T_CD4_Naive',
 'T_CD4_eff',
 'T_CD8_CM',
 'T_CD8_CM_stem',
 'T_CD8_EM_CX3CR1high',
 'T_CD8_EM_CX3CR1int',
 'T_CD8_IFNresponse',
 'T_CD8_Mem_cytotoxic',
 'T_CD8_Naive',
 'T_CD8_activated',
 'T_CD8_arrested',
 'T_CD8_eff_HOBIT',
 'T_Progenitors',
 'T_Proliferative',
 'Th0',
 'Th1',
 'Th2',
 'Tregs',
 'Tregs_activat

# Split Lineages

In [16]:
annotation_col_name = 'annotation_step_5'
not_interesting_categories = ["LowQuality_cells", "Doublets",  'Platelets', "RBC"]
finished_categories = [
'B_IFNresponder',
 'B_Memory_ITGAX',
 'B_Memory_switched',
 'B_Memory_unswitched',
 'B_Naive',
 'B_Naive_activated',
 'B_Progenitors',
 'B_Transitional',
 'DC4',
 'DC5',
 'DC_CCR7',
 'DC_Proliferative',
 'HSC_LMP',
 'HSC_MEMP',
 'HSC_MMP',
 'MAIT',
 'MAIT_17',
 'Mono_IFNresponse',
 'Mono_classical',
 'Mono_inflammatory',
 'Mono_nonClassical',
 'Mono_regulatory',
 'NK_CD16high',
 'NK_CD56dimCD16',
 'NK_CD56high',
 'NK_IFN1response',
 'NK_Proliferative',
 'NK_adaptive',
 'NK_lowRibocontent',
 'Plasma_IGHA',
 'Plasma_IGHG',
 'Plasma_XBP1',
 'Plasma_Proliferative',
 'T_CD4_CM',
 'T_CD4_CM_ribo',
 'T_CD4_EM',
 'T_CD4_EMRA',
 'T_CD4_Naive',
 'T_CD4_eff',
 'T_CD8_CM',
 'T_CD8_CM_stem',
 'T_CD8_EM_CX3CR1high',
 'T_CD8_EM_CX3CR1int',
 'T_CD8_IFNresponse',
 'T_CD8_Mem_cytotoxic',
 'T_CD8_Naive',
 'T_CD8_activated',
 'T_CD8_arrested',
 'T_CD8_eff_HOBIT',
 'T_Progenitors',
 'T_Proliferative',
 'Th0',
 'Th1',
 'Th2',
 'Tregs',
 'Tregs_activated',
 'cDC1',
 'cDC2',
 'cDC3',
 'gdT_V1',
 'gdT_V2_Vγ9',
 'pDC'
]

In [17]:
cell_counts = master_AnnotDF.groupby(annotation_col_name).size().sort_values(ascending=False)
print(cell_counts)

annotation_step_5
T_CD4_Naive         792955
Mono_classical      485760
LowQuality_cells    241452
Doublets            240766
T_CD8_Naive         213776
                     ...  
DC5                   1697
B_Progenitors          987
T_Progenitors          868
HSC_MEMP               392
DC_Proliferative       200
Length: 66, dtype: int64


In [18]:
total_cells_to_remove = cell_counts.loc[not_interesting_categories].sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category in not_interesting_categories:
    cells_removed = cell_counts.loc[category]
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 638788 cells
LowQuality_cells: 241452 cells removed
Doublets: 240766 cells removed
Platelets: 154085 cells removed
RBC: 2485 cells removed


In [19]:
total_cells_finished = cell_counts.loc[finished_categories].sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category in finished_categories:
    cells_finished = cell_counts.loc[category]
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 4279352 cells
B_IFNresponder: 9403 cells finished
B_Memory_ITGAX: 16177 cells finished
B_Memory_switched: 100092 cells finished
B_Memory_unswitched: 11409 cells finished
B_Naive: 130137 cells finished
B_Naive_activated: 15055 cells finished
B_Progenitors: 987 cells finished
B_Transitional: 74601 cells finished
DC4: 5729 cells finished
DC5: 1697 cells finished
DC_CCR7: 8023 cells finished
DC_Proliferative: 200 cells finished
HSC_LMP: 2784 cells finished
HSC_MEMP: 392 cells finished
HSC_MMP: 2002 cells finished
MAIT: 36611 cells finished
MAIT_17: 6005 cells finished
Mono_IFNresponse: 177845 cells finished
Mono_classical: 485760 cells finished
Mono_inflammatory: 118967 cells finished
Mono_nonClassical: 140172 cells finished
Mono_regulatory: 126405 cells finished
NK_CD16high: 23214 cells finished
NK_CD56dimCD16: 183200 cells finished
NK_CD56high: 32550 cells finished
NK_IFN1response: 14267 cells finished
NK_Proliferative: 13494 cells finished
NK_adaptive: 35628 

In [20]:
next_step_categories = cell_counts.index.difference(set(finished_categories).union(set(not_interesting_categories)))
total_cells_of_interest = cell_counts.loc[next_step_categories].sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category in next_step_categories:
    cells_missing = cell_counts.loc[category]
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 0 cells


In [21]:
master_AnnotDF[annotation_col_name].unique()

array(['pDC', 'Doublets', 'LowQuality_cells', 'Platelets',
       'DC_Proliferative', 'DC5', 'cDC2', 'cDC3', 'DC_CCR7',
       'Mono_regulatory', 'Mono_classical', 'Mono_nonClassical',
       'Mono_inflammatory', 'Mono_IFNresponse', 'cDC1', 'DC4', 'RBC',
       'T_CD8_eff_HOBIT', 'T_CD4_Naive', 'T_CD4_CM', 'Th0', 'Tregs',
       'T_CD8_activated', 'T_CD8_CM', 'T_CD8_Mem_cytotoxic',
       'T_CD8_IFNresponse', 'T_CD8_EM_CX3CR1high', 'NK_CD56dimCD16',
       'T_CD8_EM_CX3CR1int', 'T_CD4_EM', 'T_CD8_Naive', 'NK_IFN1response',
       'NK_lowRibocontent', 'NK_adaptive', 'NK_CD56high', 'Th1',
       'Tregs_activated', 'T_CD4_EMRA', 'NK_Proliferative',
       'T_CD4_CM_ribo', 'T_Proliferative', 'NK_CD16high', 'T_CD4_eff',
       'T_CD8_arrested', 'Th2', 'T_CD8_CM_stem', 'MAIT_17', 'gdT_V2_Vγ9',
       'gdT_V1', 'MAIT', 'T_Progenitors', 'Plasma_IGHA', 'Plasma_IGHG',
       'Plasma_Proliferative', 'Plasma_XBP1', 'B_Memory_switched',
       'B_Naive', 'HSC_LMP', 'HSC_MMP', 'B_Progenitors', 'HSC_

In [22]:
master_AnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

## Add Level1 annotation

In [23]:
master_AnnotDF["Level1"] = master_AnnotDF["annotation_step_5"].astype('str')
master_AnnotDF["Level1"].unique()

array(['pDC', 'Doublets', 'LowQuality_cells', 'Platelets',
       'DC_Proliferative', 'DC5', 'cDC2', 'cDC3', 'DC_CCR7',
       'Mono_regulatory', 'Mono_classical', 'Mono_nonClassical',
       'Mono_inflammatory', 'Mono_IFNresponse', 'cDC1', 'DC4', 'RBC',
       'T_CD8_eff_HOBIT', 'T_CD4_Naive', 'T_CD4_CM', 'Th0', 'Tregs',
       'T_CD8_activated', 'T_CD8_CM', 'T_CD8_Mem_cytotoxic',
       'T_CD8_IFNresponse', 'T_CD8_EM_CX3CR1high', 'NK_CD56dimCD16',
       'T_CD8_EM_CX3CR1int', 'T_CD4_EM', 'T_CD8_Naive', 'NK_IFN1response',
       'NK_lowRibocontent', 'NK_adaptive', 'NK_CD56high', 'Th1',
       'Tregs_activated', 'T_CD4_EMRA', 'NK_Proliferative',
       'T_CD4_CM_ribo', 'T_Proliferative', 'NK_CD16high', 'T_CD4_eff',
       'T_CD8_arrested', 'Th2', 'T_CD8_CM_stem', 'MAIT_17', 'gdT_V2_Vγ9',
       'gdT_V1', 'MAIT', 'T_Progenitors', 'Plasma_IGHA', 'Plasma_IGHG',
       'Plasma_Proliferative', 'Plasma_XBP1', 'B_Memory_switched',
       'B_Naive', 'HSC_LMP', 'HSC_MMP', 'B_Progenitors', 'HSC_

In [24]:
# Level1 dictionary

annotation_level1_unification = {
    "B": ['B_IFNresponder','B_Memory_ITGAX','B_Memory_switched','B_Memory_unswitched','B_Naive','B_Naive_activated', 'B_Transitional'],
    "Plasma": [ 'Plasma_IGHA', 'Plasma_IGHG', 'Plasma_XBP1', ],
    "pDC": ['pDC'],
    "Mono": ['Mono_regulatory', 'Mono_classical', 'Mono_nonClassical', 'Mono_inflammatory','Mono_IFNresponse'],
    "DC": ['cDC1', 'cDC2', 'cDC3', 'DC4', 'DC5', 'DC_CCR7'],
    "T_CD4_Naive": ['T_CD4_Naive'],
    "T_CD4_NonNaive": [ 'T_CD4_CM', 'T_CD4_CM_ribo', 'T_CD4_EM', 'T_CD4_EMRA', 'T_CD4_eff', 'Th0','Th1', 'Th2', 'Tregs', 'Tregs_activated',],
    "T_CD8_Naive": ['T_CD8_Naive'],
    "T_CD8_NonNaive": [ 'T_CD8_CM', 'T_CD8_CM_stem', 'T_CD8_EM_CX3CR1high', 'T_CD8_EM_CX3CR1int', 'T_CD8_IFNresponse', 'T_CD8_Mem_cytotoxic', 'T_CD8_activated', 'T_CD8_arrested', 'T_CD8_eff_HOBIT'],
    "UTC": ['gdT_V1', 'gdT_V2_Vγ9',  'MAIT', 'MAIT_17'],
    "ILC": [ 'NK_CD16high', 'NK_CD56dimCD16', 'NK_CD56high', 'NK_IFN1response', 'NK_adaptive', 'NK_lowRibocontent'],
    "Progenitors": [ 'HSC_LMP', 'HSC_MEMP', 'HSC_MMP','B_Progenitors',  'T_Progenitors',],
    "Cycling_cells": ['DC_Proliferative', 'T_Proliferative', 'NK_Proliferative', 'Plasma_Proliferative'],
    "RBC": ['RBC'],
    "Platelets": ['Platelets'],
    "Doublets": ['Doublets'],         
    "LowQuality_cells": ['LowQuality_cells']
}

annotation_level1_unification_rev = {old_label: new_label for new_label, old_labels in annotation_level1_unification.items() for old_label in old_labels}

In [25]:
master_AnnotDF["Level1"] = master_AnnotDF["annotation_step_5"].astype('str')

#Renaming
master_AnnotDF['Level1'] = master_AnnotDF['annotation_step_5'].map(annotation_level1_unification_rev)

# Convert the column back to categorical if needed
master_AnnotDF["Level1"] = master_AnnotDF["Level1"].astype('category')

In [26]:
master_AnnotDF.groupby("Level1").size().sort_values(ascending=False)

Level1
Mono                1049149
T_CD4_Naive          792955
T_CD4_NonNaive       712248
T_CD8_NonNaive       523322
ILC                  376885
B                    356874
LowQuality_cells     241452
Doublets             240766
T_CD8_Naive          213776
Platelets            154085
UTC                  115097
DC                    47076
Cycling_cells         45181
pDC                   22902
Plasma                16854
Progenitors            7033
RBC                    2485
dtype: int64

In [27]:
master_AnnotDF.groupby("Level1").size().sum()

4918140

## Add Level2 annotation

In [28]:
master_AnnotDF["Level2"] = master_AnnotDF["annotation_step_5"].astype('str')
master_AnnotDF["Level2"].unique()

array(['pDC', 'Doublets', 'LowQuality_cells', 'Platelets',
       'DC_Proliferative', 'DC5', 'cDC2', 'cDC3', 'DC_CCR7',
       'Mono_regulatory', 'Mono_classical', 'Mono_nonClassical',
       'Mono_inflammatory', 'Mono_IFNresponse', 'cDC1', 'DC4', 'RBC',
       'T_CD8_eff_HOBIT', 'T_CD4_Naive', 'T_CD4_CM', 'Th0', 'Tregs',
       'T_CD8_activated', 'T_CD8_CM', 'T_CD8_Mem_cytotoxic',
       'T_CD8_IFNresponse', 'T_CD8_EM_CX3CR1high', 'NK_CD56dimCD16',
       'T_CD8_EM_CX3CR1int', 'T_CD4_EM', 'T_CD8_Naive', 'NK_IFN1response',
       'NK_lowRibocontent', 'NK_adaptive', 'NK_CD56high', 'Th1',
       'Tregs_activated', 'T_CD4_EMRA', 'NK_Proliferative',
       'T_CD4_CM_ribo', 'T_Proliferative', 'NK_CD16high', 'T_CD4_eff',
       'T_CD8_arrested', 'Th2', 'T_CD8_CM_stem', 'MAIT_17', 'gdT_V2_Vγ9',
       'gdT_V1', 'MAIT', 'T_Progenitors', 'Plasma_IGHA', 'Plasma_IGHG',
       'Plasma_Proliferative', 'Plasma_XBP1', 'B_Memory_switched',
       'B_Naive', 'HSC_LMP', 'HSC_MMP', 'B_Progenitors', 'HSC_

In [29]:
master_AnnotDF.head()

Unnamed: 0_level_0,annotation_step_1,annotation_step_2,annotation_step_3,annotation_step_4,annotation_step_5,Level1,Level2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG,pDC,pDC,pDC,pDC,pDC,pDC,pDC
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA,pDC,pDC,pDC,pDC,pDC,pDC,pDC
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT,pDC,pDC,pDC,pDC,pDC,pDC,pDC
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT,pDC,pDC,pDC,pDC,pDC,pDC,pDC
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC,pDC,pDC,pDC,pDC,pDC,pDC,pDC


In [30]:
# Save
master_AnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/master_clusterAnnotDF_final.pkl"))

In [31]:
session_info.show()