In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


#plt.style.use(['science','nature','no-latex'])
dpi_fig_save = 300

overwriteFigures = True
overwriteData = True

**Loading annotation pkl**

In [2]:
step = "Step3"
next_step = "Step4"

In [3]:
annotation_col_name = 'annotation_step_3'

In [4]:
file_paths = glob.glob(str(here(f"02_fromCellLineages_to_CellTypes/{step}/*/results/INFLAMMATION_cluster_res*_annotated.pkl")))
# Load and concatenate all files into a single column DataFrame

In [5]:
for file_path in file_paths: 
    clusterAnnotDF = pd.read_pickle(file_path)
    print(set(clusterAnnotDF[annotation_col_name]))

{'Doublets_Step3', 'DC'}
{'Plasma', 'LowQuality_cells'}
{'T_NonNaive', 'T_Naive'}
{'LowQuality_cells', 'B'}


In [6]:
clusterAnnotDF = pd.concat([pd.read_pickle(file_path) for file_path in file_paths], axis=0)

In [7]:
print(clusterAnnotDF.head())

                                       annotation_step_3
cellID                                                  
SCGT00_L051_I52.3P_T0_GAGCTGCTCTGTTGGA                DC
SCGT00_L051_I55.3P_T0_TTGCTGCGTATGTCTG                DC
SCGT00_L035_I55.3P_T0_GTGAGGATCCCATAGA                DC
SCGT00_L035_I52.3P_T0_GTTGCGGCAACTCCAA                DC
SCGT00_L029_I58.3P_T0_GAGTTTGGTCCGCAGT                DC


In [8]:
set(clusterAnnotDF[annotation_col_name])

{'B',
 'DC',
 'Doublets_Step3',
 'LowQuality_cells',
 'Plasma',
 'T_Naive',
 'T_NonNaive'}

In [9]:
print(clusterAnnotDF.shape)

(2930553, 1)


In [10]:
clusterAnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))

**Updating master annotation DF**

In [11]:
master_AnnotDFpath = str(here("02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))
master_AnnotDF = pd.read_pickle(master_AnnotDFpath)

In [12]:
last_column_name = master_AnnotDF.columns[-1]
master_AnnotDF[annotation_col_name] = master_AnnotDF[last_column_name]
master_AnnotDF[annotation_col_name].update(clusterAnnotDF[annotation_col_name])

In [14]:
print(master_AnnotDF.head())

                                       annotation_step_1 annotation_step_2  \
cellID                                                                       
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC               pDC   
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC               pDC   
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC               pDC   
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC               pDC   
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC               pDC   

                                       annotation_step_3  
cellID                                                    
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC  
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC  
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC  
SCGT00_L051_I55.3P_T0_ACTTTCACATGGGTTT               pDC  
SCGT00_L051_I52.3P_T0_ATCACAGTCATCGGGC               pDC  


In [15]:
print(master_AnnotDF.shape)

(4919089, 3)


In [16]:
set(master_AnnotDF[annotation_col_name])

{'B',
 'DC',
 'Doublets',
 'Doublets_Step2',
 'Doublets_Step3',
 'HSC',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'RBC',
 'T_Naive',
 'T_NonNaive',
 'pDC'}

In [18]:
master_AnnotDF.to_pickle(here(f"02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

# Split Lineages

In [19]:
annotation_col_name = 'annotation_step_3'
not_interesting_categories = ["LowQuality_cells", "Doublets", "RBC", "Platelets", "Doublets_Step2"]
finished_categories = ["HSC", "Mono", "NK", "pDC"]

In [21]:
cell_counts = master_AnnotDF.groupby(annotation_col_name).size().sort_values(ascending=False)
print(cell_counts)

annotation_step_3
T_NonNaive          1422966
T_Naive             1086752
Mono                1022749
NK                   455612
B                    343182
Doublets             207450
LowQuality_cells     101976
Doublets_Step2        92874
Platelets             76922
DC                    55084
pDC                   22059
Plasma                19917
HSC                    7554
RBC                    2642
Doublets_Step3         1350
dtype: int64


In [22]:
total_cells_to_remove = cell_counts.loc[not_interesting_categories].sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category in not_interesting_categories:
    cells_removed = cell_counts.loc[category]
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 481864 cells
LowQuality_cells: 101976 cells removed
Doublets: 207450 cells removed
RBC: 2642 cells removed
Platelets: 76922 cells removed
Doublets_Step2: 92874 cells removed


In [24]:
total_cells_finished = cell_counts.loc[finished_categories].sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category in finished_categories:
    cells_finished = cell_counts.loc[category]
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 1507974 cells
HSC: 7554 cells finished
Mono: 1022749 cells finished
NK: 455612 cells finished
pDC: 22059 cells finished


In [26]:
next_step_categories = cell_counts.index.difference(set(finished_categories).union(set(not_interesting_categories)))
total_cells_of_interest = cell_counts.loc[next_step_categories].sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category in next_step_categories:
    cells_missing = cell_counts.loc[category]
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 2929251 cells
B: 343182 cells missing
DC: 55084 cells missing
Doublets_Step3: 1350 cells missing
Plasma: 19917 cells missing
T_Naive: 1086752 cells missing
T_NonNaive: 1422966 cells missing


In [27]:
master_AnnotDF[annotation_col_name].unique()

array(['pDC', 'DC', 'LowQuality_cells', 'Doublets', 'Platelets',
       'Doublets_Step3', 'Doublets_Step2', 'Mono', 'B', 'NK', 'T_Naive',
       'T_NonNaive', 'Plasma', 'RBC', 'HSC'], dtype=object)

In [28]:
template_path = here('01_fromDatasets_to_CellLineages/template')
template_notebook_list = [str(template_path) + '/' + f for f in os.listdir(template_path) if f.endswith('.ipynb')]

annotation_path = str(here(f"02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))
targetdir = str(here(f'02_fromCellLineages_to_CellTypes/{next_step}'))
os.makedirs(targetdir)

for cellGroup in master_AnnotDF[annotation_col_name].unique():
    if cellGroup in next_step_categories: 
        print(cellGroup)
        # Generate the directory tree
        os.makedirs(here('{}/{}/results/figures'.format(targetdir, cellGroup)), exist_ok=False)
        for template_notebook in template_notebook_list:
            generate_split_dir(cellGroup = cellGroup, 
                               annotationPath = annotation_path, 
                               targetDir = targetdir, 
                               annotation_col_name = annotation_col_name, # Davide changed this line after editing the function. Check if still works
                               templateNotebook = template_notebook)
    else: 
        print(f"Skipping {cellGroup}")

Skipping pDC
DC
01_qualityControl_HVGselection.ipynb: cellGroup changed!
01_qualityControl_HVGselection.ipynb: annotationDFpath changed!
01_qualityControl_HVGselection.ipynb: annotation column changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: cellGroup changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: annotationDFpath changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: annotation column changed!
04a_clustering.ipynb: cellGroup changed!
05_compute_markers_for_specific_resolution.ipynb: cellGroup changed!
04b_annotation.ipynb: cellGroup changed!
following_split_step_to_step2.ipynb: cellGroup changed!
following_split_step_to_step2.ipynb: annotationDFpath changed!
following_split_step_to_step2.ipynb: annotation column changed!
__05b_compute_markers_for_annotated_celltypes.ipynb: cellGroup changed!
02_scVI_integration.ipynb: cellGroup changed!
Skipping LowQuality_cells
Skipping Doublets
Skipping Platelets
Doublets_Step3
01_qualityControl_HVGselection.ipynb: cellGroup changed!
01_qualityCon

In [None]:
session_info.show()