In [12]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


#plt.style.use(['science','nature','no-latex'])
dpi_fig_save = 300

overwriteFigures = True
overwriteData = True

**Loading annotation pkl**

In [13]:
step = "Step3"

In [14]:
annotation_col_name = 'annotation_step_3'

In [15]:
file_paths = glob.glob(str(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{step}/*/results/SCGT00_cluster_res*_annotated.pkl")))
# Load and concatenate all files into a single column DataFrame

In [16]:
for file_path in file_paths: 
    print(file_path)
    clusterAnnotDF = pd.read_pickle(file_path)
    print(set(clusterAnnotDF[annotation_col_name]))

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/Step3/T_CD4_NonNaive/results/SCGT00_cluster_res0.75_annotated.pkl
{'T_CD4_NonNaive', 'T_CD4_Naive', 'T_CD8_NonNaive'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/Step3/T_CD8_Naive/results/SCGT00_cluster_res0.1_annotated.pkl
{'T_CD8_Naive'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/Step3/T_CD4_Naive/results/SCGT00_cluster_res0.25_annotated.pkl
{'T_CD4_NonNaive', 'T_CD4_Naive'}
/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/Step3/ILC/results/SCGT00_cluster_res0.1_annotated.pkl
{'ILC'}
/scratch_isilon/groups/singlecell

In [17]:
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

In [18]:
print(clusterAnnotDF.head())

                                       annotation_step_3
cellID                                                  
SCGT00_L051_I57.3P_T0_AAACCCAGTCAAAGAT    T_CD4_NonNaive
SCGT00_L051_I52.3P_T0_AAACGAAAGGCTAACG    T_CD4_NonNaive
SCGT00_L051_I51.3P_T0_AAACGAAGTTCTAACG    T_CD8_NonNaive
SCGT00_L051_I52.3P_T0_AAACGAATCTACGCAA    T_CD4_NonNaive
SCGT00_L051_I55.3P_T0_AAACGCTGTCTTTCTA    T_CD4_NonNaive


In [19]:
set(clusterAnnotDF[annotation_col_name])

{'ILC',
 'LowQuality_cells',
 'Platelets',
 'T_CD4_Naive',
 'T_CD4_NonNaive',
 'T_CD8_Naive',
 'T_CD8_NonNaive',
 'UTC'}

In [20]:
print(clusterAnnotDF.shape)

(514133, 1)


In [21]:
clusterAnnotDF.to_pickle(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))

**Updating master annotation DF**

In [22]:
master_AnnotDFpath = str(here("02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))
master_AnnotDF = pd.read_pickle(master_AnnotDFpath)

In [23]:
last_column_name = master_AnnotDF.columns[-1]
master_AnnotDF[annotation_col_name] = master_AnnotDF[last_column_name]
master_AnnotDF[annotation_col_name].update(clusterAnnotDF[annotation_col_name])

In [24]:
print(master_AnnotDF.head())

                                       annotation_step_1 annotation_step_2  \
cellID                                                                       
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC   
SCGT00_L051_I51.3P_T0_AAGGTAATCTTCCCGA               pDC               pDC   
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC   
SCGT00_L051_I51.3P_T0_ACGGGTCTCTTGGTGA               pDC               pDC   
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC   

                                       annotation_step_3  
cellID                                                    
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC  
SCGT00_L051_I51.3P_T0_AAGGTAATCTTCCCGA               pDC  
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC  
SCGT00_L051_I51.3P_T0_ACGGGTCTCTTGGTGA               pDC  
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC  


In [25]:
print(master_AnnotDF.shape)

(855417, 3)


In [26]:
set(master_AnnotDF[annotation_col_name])

{'B',
 'Cycling',
 'DC',
 'Doublets_Step1',
 'Doublets_Step2',
 'ILC',
 'LowQuality_cells',
 'Mono',
 'Plasma',
 'Platelets',
 'Progenitors',
 'RBC',
 'T_CD4_Naive',
 'T_CD4_NonNaive',
 'T_CD8_Naive',
 'T_CD8_NonNaive',
 'UTC',
 'pDC'}

# Split Lineages

In [27]:
annotation_col_name = 'annotation_step_3'
not_interesting_categories = ["LowQuality_cells", "Doublets_Step1", "Doublets_Step2"]
finished_categories = ["Progenitors", "pDC", "Plasma", "RBC", "Platelets", "Cycling", "B", "DC", "Mono", "UTC", "ILC",  'T_CD4_Naive', 'T_CD4_NonNaive', 'T_CD8_Naive', 'T_CD8_NonNaive']

In [28]:
cell_counts = master_AnnotDF.groupby(annotation_col_name).size().sort_values(ascending=False)
print(cell_counts)

annotation_step_3
T_CD4_NonNaive      159203
Mono                131392
T_CD4_Naive         123116
T_CD8_NonNaive       93256
LowQuality_cells     61139
ILC                  57046
B                    55869
Platelets            52472
T_CD8_Naive          40745
Doublets_Step1       34797
UTC                  23420
DC                   10440
Doublets_Step2        3361
pDC                   3125
Cycling               2741
Progenitors           1716
Plasma                1383
RBC                    196
dtype: int64


In [29]:
total_cells_to_remove = cell_counts.loc[not_interesting_categories].sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category in not_interesting_categories:
    cells_removed = cell_counts.loc[category]
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 99297 cells
LowQuality_cells: 61139 cells removed
Doublets_Step1: 34797 cells removed
Doublets_Step2: 3361 cells removed


In [30]:
total_cells_finished = cell_counts.loc[finished_categories].sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category in finished_categories:
    cells_finished = cell_counts.loc[category]
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 756120 cells
Progenitors: 1716 cells finished
pDC: 3125 cells finished
Plasma: 1383 cells finished
RBC: 196 cells finished
Platelets: 52472 cells finished
Cycling: 2741 cells finished
B: 55869 cells finished
DC: 10440 cells finished
Mono: 131392 cells finished
UTC: 23420 cells finished
ILC: 57046 cells finished
T_CD4_Naive: 123116 cells finished
T_CD4_NonNaive: 159203 cells finished
T_CD8_Naive: 40745 cells finished
T_CD8_NonNaive: 93256 cells finished


In [31]:
next_step_categories = cell_counts.index.difference(set(finished_categories).union(set(not_interesting_categories)))
total_cells_of_interest = cell_counts.loc[next_step_categories].sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category in next_step_categories:
    cells_missing = cell_counts.loc[category]
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 0 cells


In [32]:
master_AnnotDF[annotation_col_name].unique()

array(['pDC', 'LowQuality_cells', 'Doublets_Step1', 'Platelets', 'Mono',
       'DC', 'Doublets_Step2', 'ILC', 'T_CD4_Naive', 'T_CD4_NonNaive',
       'T_CD8_Naive', 'T_CD8_NonNaive', 'UTC', 'Cycling', 'Plasma',
       'Progenitors', 'RBC', 'B'], dtype=object)

In [33]:
master_AnnotDF.to_pickle(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

## Add Level1 annotation

In [34]:
master_AnnotDF["Level1"] = master_AnnotDF["annotation_step_3"].astype('str')
master_AnnotDF["Level1"].unique()

array(['pDC', 'LowQuality_cells', 'Doublets_Step1', 'Platelets', 'Mono',
       'DC', 'Doublets_Step2', 'ILC', 'T_CD4_Naive', 'T_CD4_NonNaive',
       'T_CD8_Naive', 'T_CD8_NonNaive', 'UTC', 'Cycling', 'Plasma',
       'Progenitors', 'RBC', 'B'], dtype=object)

In [35]:
master_AnnotDF.groupby("Level1").size().sort_values(ascending=False)

Level1
T_CD4_NonNaive      159203
Mono                131392
T_CD4_Naive         123116
T_CD8_NonNaive       93256
LowQuality_cells     61139
ILC                  57046
B                    55869
Platelets            52472
T_CD8_Naive          40745
Doublets_Step1       34797
UTC                  23420
DC                   10440
Doublets_Step2        3361
pDC                   3125
Cycling               2741
Progenitors           1716
Plasma                1383
RBC                    196
dtype: int64

In [36]:
master_AnnotDF.groupby("Level1").size().sum()

855417

In [37]:
# Save
master_AnnotDF.to_pickle(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/master_clusterAnnotDF_final.pkl"))

In [38]:
session_info.show()