In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Notebook-wide setup:
#  - silence every warning raised from here on (blanket "ignore" filter)
#  - put the project's `bin` directory at position 1 of sys.path
#    (presumably where customPythonFunctions lives -- confirm)
warnings.filterwarnings(action="ignore")
sys.path.insert(1, str(here("bin")))

# Import custom functions
from customPythonFunctions import generate_split_dir


# Disabled plot styling, kept for reference (no `plt` import in the cell above):
#plt.style.use(['science','nature','no-latex'])

# Resolution (dots per inch) used when saving figures
dpi_fig_save = 300

# Switches controlling whether existing figures / data files may be overwritten
overwriteFigures = overwriteData = True

**Loading the annotation pickle (`.pkl`) files**

In [2]:
# Label of the step handled by this notebook and of the one that follows it
step, next_step = "Step1", "Step2"

In [3]:
# Column of the annotation tables that holds the label assigned in this step
annotation_col_name = "annotation_step_1"

In [4]:
# Collect every annotated-cluster pickle found under the sub-directories of this step.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the load / concatenation order reproducible across runs.
file_paths = sorted(glob.glob(str(here(f"02_fromCellLineages_to_CellTypes/{step}/*/results/INFLAMMATION_cluster_res*_annotated.pkl"))))

# Stop here with an explicit message when nothing matched, rather than letting
# the concatenation further down fail with a less helpful error.
if not file_paths:
    raise FileNotFoundError(f"No '*_annotated.pkl' files found for {step}")

In [5]:
# Sanity check: print the set of annotation labels contained in each file.
# The per-file table gets its own name so that `clusterAnnotDF` (the name used
# for the full concatenated table) is never bound to a single file's data.
for file_path in file_paths:
    perFileAnnotDF = pd.read_pickle(file_path)
    print(set(perFileAnnotDF[annotation_col_name]))

{'Doublets_Step1', 'pDC', 'DC', 'LowQuality_cells'}
{'LowQuality_cells', 'Platelets'}
{'Mono', 'DC', 'Doublets_Step1'}
{'T_NK_Proliferative', 'T', 'LowQuality_cells', 'NK'}
{'B', 'Plasma', 'Doublets_Step1'}
{'HSC', 'Mono', 'RBC', 'LowQuality_cells', 'Doublets_Step1'}
{'B', 'Doublets_Step1'}


In [6]:
# Stack the per-file annotation tables row-wise into a single table
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

# pd.concat does not check that index labels stay unique. Report (without
# failing) any repeated index entry, because the same cell ID would then carry
# more than one row in the combined table.
n_duplicated_ids = int(clusterAnnotDF.index.duplicated().sum())
if n_duplicated_ids > 0:
    print(f"WARNING: {n_duplicated_ids} duplicated index entries (cell IDs) after concatenation")

In [7]:
# Peek at the first five rows of the combined annotation table
print(clusterAnnotDF.head(n=5))

                                       annotation_step_1
cellID                                                  
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC


In [8]:
# Distinct annotation labels present across all loaded files
set(clusterAnnotDF[annotation_col_name].unique())

{'B',
 'DC',
 'Doublets_Step1',
 'HSC',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'RBC',
 'T',
 'T_NK_Proliferative',
 'pDC'}

In [9]:
# (number of rows, number of columns) of the combined annotation table
print((len(clusterAnnotDF.index), len(clusterAnnotDF.columns)))

(4918140, 1)


In [10]:
# Persist the combined annotation table of this step.
# The write honours the `overwriteData` switch from the setup cell: an existing
# file is replaced only when that flag is True; a missing file is always written.
step_annot_path = here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl")
if overwriteData or not os.path.exists(step_annot_path):
    clusterAnnotDF.to_pickle(step_annot_path)
else:
    print(f"{step_annot_path} already exists and overwriteData is False; file left untouched")

**Updating the master annotation DataFrame**

In [11]:
# For step 1 the master annotation table is simply the step-1 table.
# NOTE: this binds a second name to the same object; no copy is made.
master_AnnotDF = clusterAnnotDF

# Plain string literal: the path contains no placeholders, so no f-prefix is needed.
master_annot_path = here("02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl")

# Write through the master name, and only replace an existing master file when
# the `overwriteData` switch from the setup cell is True.
if overwriteData or not os.path.exists(master_annot_path):
    master_AnnotDF.to_pickle(master_annot_path)
else:
    print(f"{master_annot_path} already exists and overwriteData is False; file left untouched")

# Split Lineages

In [12]:
# Annotation column inspected in this section
annotation_col_name = "annotation_step_1"

# Labels reported below as removed
not_interesting_categories = [
    "LowQuality_cells",
    "RBC",
    "Platelets",
]

# Labels reported below as finished (none at this step)
finished_categories = list()

In [13]:
# Number of cells per annotation label, most abundant label first
cell_counts = (
    master_AnnotDF
    .groupby(annotation_col_name)
    .size()
    .sort_values(ascending=False)
)
print(cell_counts)

annotation_step_1
T                     2592743
Mono                  1118573
NK                     480196
B                      358113
Doublets_Step1         207963
DC                      57358
T_NK_Proliferative      35481
pDC                     23188
Plasma                  19748
Platelets               14824
HSC                      7292
LowQuality_cells         1999
RBC                       662
dtype: int64


In [14]:
# Summarise how many cells fall in the categories that are being removed.
# reindex(..., fill_value=0) is used instead of .loc[...] so that a listed
# category that is absent from cell_counts counts as 0 rather than raising KeyError.
removed_counts = cell_counts.reindex(not_interesting_categories, fill_value=0)
total_cells_to_remove = removed_counts.sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category, cells_removed in removed_counts.items():
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 17485 cells
LowQuality_cells: 1999 cells removed
RBC: 662 cells removed
Platelets: 14824 cells removed


In [15]:
# Summarise how many cells belong to categories already marked as finished.
# reindex(..., fill_value=0) is used instead of .loc[...] so that a listed
# category that is absent from cell_counts counts as 0 rather than raising KeyError.
# An empty finished_categories list simply yields a total of 0.
finished_counts = cell_counts.reindex(finished_categories, fill_value=0)
total_cells_finished = finished_counts.sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category, cells_finished in finished_counts.items():
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 0 cells


In [16]:
# Every label that is neither finished nor removed still has to be processed
already_handled = set(finished_categories) | set(not_interesting_categories)
next_step_categories = cell_counts.index.difference(already_handled)

pending_counts = cell_counts.loc[next_step_categories]
total_cells_of_interest = pending_counts.sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category, cells_missing in pending_counts.items():
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 4900655 cells
B: 358113 cells missing
DC: 57358 cells missing
Doublets_Step1: 207963 cells missing
HSC: 7292 cells missing
Mono: 1118573 cells missing
NK: 480196 cells missing
Plasma: 19748 cells missing
T: 2592743 cells missing
T_NK_Proliferative: 35481 cells missing
pDC: 23188 cells missing


In [17]:
# Distinct labels in the master table, in order of first appearance
master_AnnotDF.loc[:, annotation_col_name].unique()

array(['pDC', 'DC', 'LowQuality_cells', 'Doublets_Step1', 'Platelets',
       'Mono', 'T', 'NK', 'T_NK_Proliferative', 'Plasma', 'B', 'HSC',
       'RBC'], dtype=object)

In [18]:
# Display the session report produced by the session_info package
# (presumably the versions of the packages loaded in this kernel -- confirm).
session_info.show()