In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


#plt.style.use(['science','nature','no-latex'])
dpi_fig_save = 300

overwriteFigures = True
overwriteData = True

**Loading annotation pkl**

In [2]:
step = "Step2"
next_step = "Step3new"

In [3]:
annotation_col_name = 'annotation_step_2'

In [4]:
file_paths = glob.glob(str(here(f"02_fromCellLineages_to_CellTypes/{step}/*/results/INFLAMMATION_cluster_res*_annotated.pkl")))
# Load and concatenate all files into a single column DataFrame

In [5]:
for file_path in file_paths: 
    print(file_path)
    clusterAnnotDF = pd.read_pickle(file_path)
    print(set(clusterAnnotDF[annotation_col_name]))

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step2/pDC/results/INFLAMMATION_cluster_res0.75_annotated.pkl
{'pDC', 'Platelets'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step2/NK/results/INFLAMMATION_cluster_res0.25_annotated.pkl
{'T_NonNaive', 'RBC', 'NK'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step2/T_NK_Proliferative/results/INFLAMMATION_cluster_res0.25_annotated.pkl
{'T_NonNaive', 'Doublets_Step2', 'NK'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromCellLineages_to_CellTypes/Step2/DC/results/INFLAMMATION_cluster_res0.75_annotated.pkl
{'DC', 'Doublets_Step2', 'LowQuality_cells'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/02_fromC

In [6]:
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

In [7]:
print(clusterAnnotDF.head())

                                       annotation_step_2
cellID                                                  
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC


In [8]:
set(clusterAnnotDF[annotation_col_name])

{'B',
 'DC',
 'Doublets',
 'Doublets_Step2',
 'HSC',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'RBC',
 'T_Naive',
 'T_NonNaive',
 'pDC'}

In [9]:
print(clusterAnnotDF.shape)

(4900655, 1)


In [10]:
clusterAnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))

**Updating master annotation DF**

In [11]:
master_AnnotDFpath = str(here("02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))
master_AnnotDF = pd.read_pickle(master_AnnotDFpath)

In [12]:
last_column_name = master_AnnotDF.columns[-1]
master_AnnotDF[annotation_col_name] = master_AnnotDF[last_column_name]
master_AnnotDF[annotation_col_name].update(clusterAnnotDF[annotation_col_name])

In [13]:
print(master_AnnotDF.head())

                                       annotation_step_1 annotation_step_2
cellID                                                                    
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC               pDC
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC               pDC


In [14]:
print(master_AnnotDF.shape)

(4918140, 2)


In [15]:
set(master_AnnotDF[annotation_col_name])

{'B',
 'DC',
 'Doublets',
 'Doublets_Step2',
 'HSC',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'RBC',
 'T_Naive',
 'T_NonNaive',
 'pDC'}

In [16]:
master_AnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

# Split Lineages

In [17]:
annotation_col_name = 'annotation_step_2'
not_interesting_categories = ["LowQuality_cells", "RBC", "Platelets", "Doublets"]
finished_categories = []

In [18]:
cell_counts = master_AnnotDF.groupby(annotation_col_name).size().sort_values(ascending=False)
print(cell_counts)

annotation_step_2
T_NonNaive          1594678
T_Naive             1084369
Mono                1063880
NK                   390379
B                    357861
Doublets             206502
Platelets             66737
DC                    47276
Doublets_Step2        44943
pDC                   22902
Plasma                20628
LowQuality_cells      10322
HSC                    5178
RBC                    2485
dtype: int64


In [19]:
total_cells_to_remove = cell_counts.loc[not_interesting_categories].sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category in not_interesting_categories:
    cells_removed = cell_counts.loc[category]
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 286046 cells
LowQuality_cells: 10322 cells removed
RBC: 2485 cells removed
Platelets: 66737 cells removed
Doublets: 206502 cells removed


In [20]:
total_cells_finished = cell_counts.loc[finished_categories].sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category in finished_categories:
    cells_finished = cell_counts.loc[category]
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 0 cells


In [21]:
next_step_categories = cell_counts.index.difference(set(finished_categories).union(set(not_interesting_categories)))
total_cells_of_interest = cell_counts.loc[next_step_categories].sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category in next_step_categories:
    cells_missing = cell_counts.loc[category]
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 4632094 cells
B: 357861 cells missing
DC: 47276 cells missing
Doublets_Step2: 44943 cells missing
HSC: 5178 cells missing
Mono: 1063880 cells missing
NK: 390379 cells missing
Plasma: 20628 cells missing
T_Naive: 1084369 cells missing
T_NonNaive: 1594678 cells missing
pDC: 22902 cells missing


In [22]:
master_AnnotDF[annotation_col_name].unique()

array(['pDC', 'Doublets_Step2', 'LowQuality_cells', 'Doublets',
       'Platelets', 'DC', 'Mono', 'RBC', 'T_NonNaive', 'T_Naive', 'NK',
       'Plasma', 'B', 'HSC'], dtype=object)

In [23]:
session_info.show()