In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


#plt.style.use(['science','nature','no-latex'])
dpi_fig_save = 300

overwriteFigures = True
overwriteData = True

**Loading annotation pkl**

In [2]:
step = "Step3"
next_step = "Step4"

In [3]:
annotation_col_name = 'annotation_step_3'

In [4]:
file_paths = glob.glob(str(here(f"02_fromCellLineages_to_CellTypes/{step}/*/results/INFLAMMATION_cluster_res*_annotated.pkl")))
# Load and concatenate all files into a single column DataFrame

In [5]:
for file_path in file_paths: 
    print(file_path)
    clusterAnnotDF = pd.read_pickle(file_path)
    print(set(clusterAnnotDF[annotation_col_name]))

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step3/T_NonNaive/results/INFLAMMATION_cluster_res0.25_annotated.pkl
{'T_CD4_NonNaive', 'T_Proliferative', 'LowQuality_cells', 'T_CD8_NonNaive'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step3/pDC/results/INFLAMMATION_cluster_res0.1_annotated.pkl
{'pDC'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step3/NK/results/INFLAMMATION_cluster_res0.25_annotated.pkl
{'NK_Proliferative', 'NK'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step3/DC/results/INFLAMMATION_cluster_res0.1_annotated.pkl
{'DC'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/

In [6]:
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

In [7]:
print(clusterAnnotDF.head())

                                       annotation_step_3
cellID                                                  
SCGT00_L051_I56.3P_T0_AACGGGAGTTAAGGAT    T_CD8_NonNaive
SCGT00_L051_I52.3P_T0_AACGTCAAGCCGAATG    T_CD8_NonNaive
SCGT00_L051_I52.3P_T0_ACAACCAAGCTGGTGA    T_CD8_NonNaive
SCGT00_L051_I53.3P_T0_ACATGCAGTGGACTGA    T_CD8_NonNaive
SCGT00_L051_I52.3P_T0_ACGATCACAGAGCTAG    T_CD8_NonNaive


In [8]:
set(clusterAnnotDF[annotation_col_name])

{'B',
 'B_Progenitors',
 'DC',
 'Doublets',
 'HSC_LMP',
 'HSC_MEMP',
 'HSC_MMP',
 'LowQuality_cells',
 'Mono',
 'NK',
 'NK_Proliferative',
 'Plasma',
 'Platelets',
 'T_CD4_NonNaive',
 'T_CD8_NonNaive',
 'T_Naive',
 'T_Progenitors',
 'T_Proliferative',
 'pDC'}

In [9]:
print(clusterAnnotDF.shape)

(4632094, 1)


In [10]:
clusterAnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))

**Updating master annotation DF**

In [11]:
master_AnnotDFpath = str(here("02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))
master_AnnotDF = pd.read_pickle(master_AnnotDFpath)

In [12]:
last_column_name = master_AnnotDF.columns[-1]
master_AnnotDF[annotation_col_name] = master_AnnotDF[last_column_name]
master_AnnotDF[annotation_col_name].update(clusterAnnotDF[annotation_col_name])

In [13]:
print(master_AnnotDF.head())

                                       annotation_step_1 annotation_step_2  \
cellID                                                                       
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC   
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC   
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC   
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC               pDC   
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC               pDC   

                                       annotation_step_3  
cellID                                                    
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC  
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC  
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC  
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC  
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC  


In [14]:
print(master_AnnotDF.shape)

(4918140, 3)


In [15]:
set(master_AnnotDF[annotation_col_name])

{'B',
 'B_Progenitors',
 'DC',
 'Doublets',
 'HSC_LMP',
 'HSC_MEMP',
 'HSC_MMP',
 'LowQuality_cells',
 'Mono',
 'NK',
 'NK_Proliferative',
 'Plasma',
 'Platelets',
 'RBC',
 'T_CD4_NonNaive',
 'T_CD8_NonNaive',
 'T_Naive',
 'T_Progenitors',
 'T_Proliferative',
 'pDC'}

In [16]:
master_AnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

# Split Lineages

In [17]:
annotation_col_name = 'annotation_step_3'
not_interesting_categories = ["LowQuality_cells", "RBC", "Platelets", "Doublets"]
finished_categories = ['HSC_LMP', 'HSC_MEMP', 'HSC_MMP', 'pDC',
                       'T_Progenitors', 'B_Progenitors', 'T_Proliferative', 'NK_Proliferative']

In [18]:
cell_counts = master_AnnotDF.groupby(annotation_col_name).size().sort_values(ascending=False)
print(cell_counts)

annotation_step_3
Mono                1063880
T_Naive             1006731
T_CD8_NonNaive       824758
T_CD4_NonNaive       740761
NK                   376885
B                    356874
Doublets             240680
Platelets            154085
DC                    47276
T_Proliferative       27907
pDC                   22902
Plasma                20520
NK_Proliferative      13494
LowQuality_cells      11869
HSC_LMP                2784
RBC                    2485
HSC_MMP                2002
B_Progenitors           987
T_Progenitors           868
HSC_MEMP                392
dtype: int64


In [19]:
total_cells_to_remove = cell_counts.loc[not_interesting_categories].sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category in not_interesting_categories:
    cells_removed = cell_counts.loc[category]
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 409119 cells
LowQuality_cells: 11869 cells removed
RBC: 2485 cells removed
Platelets: 154085 cells removed
Doublets: 240680 cells removed


In [20]:
total_cells_finished = cell_counts.loc[finished_categories].sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category in finished_categories:
    cells_finished = cell_counts.loc[category]
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 71336 cells
HSC_LMP: 2784 cells finished
HSC_MEMP: 392 cells finished
HSC_MMP: 2002 cells finished
pDC: 22902 cells finished
T_Progenitors: 868 cells finished
B_Progenitors: 987 cells finished
T_Proliferative: 27907 cells finished
NK_Proliferative: 13494 cells finished


In [21]:
next_step_categories = cell_counts.index.difference(set(finished_categories).union(set(not_interesting_categories)))
total_cells_of_interest = cell_counts.loc[next_step_categories].sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category in next_step_categories:
    cells_missing = cell_counts.loc[category]
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 4437685 cells
B: 356874 cells missing
DC: 47276 cells missing
Mono: 1063880 cells missing
NK: 376885 cells missing
Plasma: 20520 cells missing
T_CD4_NonNaive: 740761 cells missing
T_CD8_NonNaive: 824758 cells missing
T_Naive: 1006731 cells missing


In [22]:
master_AnnotDF[annotation_col_name].unique()

array(['pDC', 'Doublets', 'LowQuality_cells', 'Platelets', 'DC', 'Mono',
       'RBC', 'T_CD8_NonNaive', 'T_Naive', 'T_CD4_NonNaive', 'NK',
       'NK_Proliferative', 'T_Proliferative', 'T_Progenitors', 'Plasma',
       'B', 'HSC_LMP', 'HSC_MMP', 'B_Progenitors', 'HSC_MEMP'],
      dtype=object)

In [23]:
session_info.show()