In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import pickle

# Setting some parameters
# NOTE(review): this suppresses every warning category for the rest of the
# session, not just a specific noisy one — confirm that is intended.
warnings.filterwarnings("ignore")
# Put the project's `bin` directory on the import path. This has to run before
# the custom import below (presumably `customPythonFunctions` lives in `bin`).
sys.path.insert(1, str(here('bin')))

# Import custom functions
from customPythonFunctions import generate_split_dir


# Plot style is left disabled; matplotlib is not imported in the visible cells.
#plt.style.use(['science','nature','no-latex'])
# Figure-saving resolution; not referenced by any cell visible in this notebook.
dpi_fig_save = 300

# Overwrite switches; not referenced by any cell visible in this notebook
# (presumably kept for parity with sibling notebooks — TODO confirm).
overwriteFigures = True
overwriteData = True

**Loading the per-lineage annotation pickle files**

In [2]:
# Stage whose annotations are collected here, and the stage whose directory
# tree is generated at the end of the notebook.
step, next_step = "Step1", "Step2"

In [3]:
# Column of the annotation tables that holds the label assigned at this step.
annotation_col_name = "annotation_step_1"

In [4]:
# Collect the annotated-cluster pickle produced by every lineage of this step.
# glob returns matches in arbitrary (filesystem-dependent) order, so they are
# sorted to keep the row order of the concatenated table reproducible.
annotation_glob = str(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{step}/*/results/SCGT00_cluster_res*_annotated.pkl"))
file_paths = sorted(glob.glob(annotation_glob))
# Fail here with a readable message rather than with an opaque error when the
# (empty) list is consumed further down.
if not file_paths:
    raise FileNotFoundError(f"No annotated cluster pickles match {annotation_glob}")

In [5]:
# Show which labels each lineage file contributes.
# The per-file table gets its own name so that `clusterAnnotDF` is never left
# bound to a single-lineage table when this cell is re-run after the
# concatenation cell (which would silently truncate everything downstream).
# NOTE: unpickling can execute arbitrary code; only load project-generated files.
for file_path in file_paths:
    lineage_annot_df = pd.read_pickle(file_path)
    print(set(lineage_annot_df[annotation_col_name]))

{'LowQuality_cells', 'Doublets_Step1', 'pDC'}
{'Platelets'}
{'LowQuality_cells', 'DC', 'Doublets_Step1', 'Platelets', 'Mono'}
{'T_Naive', 'NK', 'T_NonNaive', 'Doublets_Step1', 'Cycling'}
{'Plasma', 'Doublets_Step1'}
{'Progenitors', 'RBC'}
{'B', 'Doublets_Step1'}


In [6]:
# Read every lineage table, then stack them row-wise into a single table.
lineage_frames = list(map(pd.read_pickle, file_paths))
clusterAnnotDF = pd.concat(lineage_frames, axis=0)

In [7]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter use the rich table display instead of the plain-text print() form.
clusterAnnotDF.head()

                                       annotation_step_1
cellID                                                  
SCGT00_L051_I53.3P_T0_AACCAACGTGGGATTG               pDC
SCGT00_L051_I51.3P_T0_AAGGTAATCTTCCCGA               pDC
SCGT00_L051_I58.3P_T0_ACCTGAAAGTGCCAGA               pDC
SCGT00_L051_I51.3P_T0_ACGGGTCTCTTGGTGA               pDC
SCGT00_L051_I53.3P_T0_ACTATGGAGGCGCTCT               pDC


In [8]:
# Distinct labels present once all lineage files are concatenated.
distinct_labels = set(clusterAnnotDF[annotation_col_name])
distinct_labels

{'B',
 'Cycling',
 'DC',
 'Doublets_Step1',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'Progenitors',
 'RBC',
 'T_Naive',
 'T_NonNaive',
 'pDC'}

In [9]:
# Size of the concatenated table: one row per cellID, one annotation column.
n_cells, n_columns = clusterAnnotDF.shape
print((n_cells, n_columns))

(855417, 1)


In [10]:
# Persist the concatenated annotations of this step next to the lineage folders.
step_annotation_pkl = here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl")
clusterAnnotDF.to_pickle(step_annotation_pkl)

**Creating the master annotation DataFrame (at Step1 it is simply the Step1 annotation table)**

In [11]:
# Labels about to be carried into the master annotation table.
labels_for_master = set(clusterAnnotDF[annotation_col_name])
labels_for_master

{'B',
 'Cycling',
 'DC',
 'Doublets_Step1',
 'LowQuality_cells',
 'Mono',
 'NK',
 'Plasma',
 'Platelets',
 'Progenitors',
 'RBC',
 'T_Naive',
 'T_NonNaive',
 'pDC'}

In [12]:
# For step1
# The master table starts out as the Step1 annotations (presumably because no
# earlier master table exists yet — TODO confirm for later steps).
# Copy instead of aliasing, so an in-place edit of one table can never
# silently change the other.
master_AnnotDF = clusterAnnotDF.copy()
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
master_AnnotDF.to_pickle(here("02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/master_clusterAnnotDF.pkl"))

# Split Lineages

In [36]:
# Column holding the labels that drive the split below.
annotation_col_name = "annotation_step_1"

# Labels reported as removed in the summary cells below.
not_interesting_categories = [
    "LowQuality_cells",
    "Doublets_Step1",
]

# Labels reported as finished; they are skipped when the next-step
# directories are generated.
finished_categories = [
    "Progenitors",
    "pDC",
    "Plasma",
    "RBC",
    "Platelets",
    "Cycling",
]

In [37]:
# Number of cells per label, largest group first.
cells_per_label = master_AnnotDF.groupby(annotation_col_name).size()
cell_counts = cells_per_label.sort_values(ascending=False)
print(cell_counts)

annotation_step_1
T_Naive             260868
T_NonNaive          241341
Mono                134647
NK                   75377
B                    64308
Doublets_Step1       34797
Platelets            18962
DC                   13801
pDC                   3125
Cycling               2741
LowQuality_cells      2155
Progenitors           1716
Plasma                1383
RBC                    196
dtype: int64


In [38]:
# Report how many cells carry a label that is discarded at this step.
# reindex(..., fill_value=0) reports 0 for a listed category that has no
# cells, whereas .loc[...] raises KeyError in that case.
removed_counts = cell_counts.reindex(not_interesting_categories, fill_value=0)
total_cells_to_remove = removed_counts.sum()
print(f"We are removing a total of {total_cells_to_remove} cells")
for category, cells_removed in removed_counts.items():
    print(f"{category}: {cells_removed} cells removed")

We are removing a total of 36952 cells
LowQuality_cells: 2155 cells removed
Doublets_Step1: 34797 cells removed


In [39]:
# Report how many cells carry a label that needs no further processing.
# reindex(..., fill_value=0) reports 0 for a listed category that has no
# cells, whereas .loc[...] raises KeyError in that case.
finished_counts = cell_counts.reindex(finished_categories, fill_value=0)
total_cells_finished = finished_counts.sum()
print(f"We have finished a total of {total_cells_finished} cells")
for category, cells_finished in finished_counts.items():
    print(f"{category}: {cells_finished} cells finished")

We have finished a total of 28123 cells
Progenitors: 1716 cells finished
pDC: 3125 cells finished
Plasma: 1383 cells finished
RBC: 196 cells finished
Platelets: 18962 cells finished
Cycling: 2741 cells finished


In [40]:
# Every label that is neither finished nor discarded moves on to the next step.
already_handled = set(finished_categories) | set(not_interesting_categories)
next_step_categories = cell_counts.index.difference(already_handled)
pending_counts = cell_counts.loc[next_step_categories]
total_cells_of_interest = pending_counts.sum()
print(f"We are still missing a total of {total_cells_of_interest} cells")
for category, cells_missing in pending_counts.items():
    print(f"{category}: {cells_missing} cells missing")

We are still missing a total of 790342 cells
B: 64308 cells missing
DC: 13801 cells missing
Mono: 134647 cells missing
NK: 75377 cells missing
T_Naive: 260868 cells missing
T_NonNaive: 241341 cells missing


In [41]:
# Labels of the master table; this is the sequence the split loop iterates over.
label_column = master_AnnotDF[annotation_col_name]
label_column.unique()

array(['pDC', 'LowQuality_cells', 'Doublets_Step1', 'Platelets', 'Mono',
       'DC', 'NK', 'T_Naive', 'T_NonNaive', 'Cycling', 'Plasma',
       'Progenitors', 'RBC', 'B'], dtype=object)

In [42]:
# Build the next-step workspace: one directory per label that still needs
# processing, each populated from the template notebooks.

# os.listdir returns entries in arbitrary order, so the notebooks are sorted
# to make the generation order (and the printed log) reproducible.
template_path = here('02_cell_annotation/SCGT00_CentralizedDataset/01_fromDataset_to_CellLineages/template')
template_notebook_list = sorted(
    os.path.join(template_path, f)
    for f in os.listdir(template_path)
    if f.endswith('.ipynb')
)

annotation_path = str(here(f"02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{step}/{step}_clusterAnnotDF.pkl"))
targetdir = str(here(f'02_cell_annotation/SCGT00_CentralizedDataset/02_fromCellLineages_to_CellTypes/{next_step}'))
# No exist_ok=True here: if the next-step directory already exists this raises
# FileExistsError instead of generating on top of it.
# NOTE(review): presumably a deliberate guard against clobbering existing work.
os.makedirs(targetdir)

for cellGroup in master_AnnotDF[annotation_col_name].unique():
    if cellGroup not in next_step_categories:
        print(f"Skipping {cellGroup}")
        continue

    print(cellGroup)
    # Generate the directory tree. `targetdir` is already an absolute path, so
    # the sub-path is joined onto it directly rather than passed through here().
    os.makedirs(os.path.join(targetdir, cellGroup, 'results', 'figures'), exist_ok=False)
    for template_notebook in template_notebook_list:
        generate_split_dir(cellGroup = cellGroup,
                           annotationPath = annotation_path,
                           targetDir = targetdir,
                           # Argument added after generate_split_dir was edited.
                           # TODO(review): confirm the function still handles it as expected.
                           annotation_col_name = annotation_col_name,
                           templateNotebook = template_notebook)

Skipping pDC
Skipping LowQuality_cells
Skipping Doublets_Step1
Skipping Platelets
Mono
01_qualityControl_HVGselection.ipynb: cellGroup changed!
01_qualityControl_HVGselection.ipynb: annotationDFpath changed!
01_qualityControl_HVGselection.ipynb: annotation column changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: cellGroup changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: annotationDFpath changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: annotation column changed!
05_annotation.ipynb: cellGroup changed!
02_integration_scVI.ipynb: cellGroup changed!
06_computeMarkers_specificResolution.ipynb: cellGroup changed!
04_clustering.ipynb: cellGroup changed!
DC
01_qualityControl_HVGselection.ipynb: cellGroup changed!
01_qualityControl_HVGselection.ipynb: annotationDFpath changed!
01_qualityControl_HVGselection.ipynb: annotation column changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: cellGroup changed!
03_compute_neighborsUMAP_QCmetrics.ipynb: annotationDFpath changed!
03_compute_neighborsUM

In [43]:
# Record the session's package/environment information at the end of the run.
session_info.show()