In [10]:
## Import Libraries
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [8]:
## Utility Functions

# parse the tree file to get label -> ancestor path mapping
def parse_tree_file(file_path: str) -> dict:
    """Parse a 2-space-indented tree file into a label -> ancestor-path mapping.

    Each non-blank line is one node; its depth is the number of leading
    whitespace characters divided by two (integer division).

    Args:
        file_path: Path to the indented hierarchy text file.

    Returns:
        dict mapping every label found at depth >= 2 to the list of its
        ancestors, ordered from the root down to its direct parent. Labels at
        depth 0 and 1 are not keys, but they do appear inside ancestor lists.
        If a label occurs more than once, the last occurrence wins.
    """
    label_to_ancestors = {}
    path_stack = []

    with open(file_path, 'r') as f:
        for line in f:
            label = line.strip()
            if not label:
                # Blank / whitespace-only lines are not nodes. Without this guard a
                # bare "\n" counts as indent 1 -> level 0 and wipes the ancestor stack.
                continue

            indent = len(line) - len(line.lstrip())
            level = indent // 2  # Assumes 2 spaces per indent

            # Drop everything at this depth or deeper, then push the current node
            path_stack = path_stack[:level]
            path_stack.append(label)

            if level >= 2:
                label_to_ancestors[label] = path_stack[:-1]  # from root to parent

    return label_to_ancestors

In [4]:
## QC1 :
## Ensure every prediction file inside a level3 folder has a true_phenotype column.
## If the column is missing, create it as a copy of the cell_type column.

# All files named predictions_*.csv that sit in a level3 folder under ../results
prediction_files = glob.glob("../results/**/**/level3/predictions_*.csv", recursive=True)

# Loop through each prediction file with progress bar
for pred_file in tqdm(prediction_files, desc="Processing prediction files"):
    df = pd.read_csv(pred_file)

    # Nothing to do: leave the file on disk untouched instead of round-tripping
    # it through pandas, which could change formatting of unchanged files
    if 'true_phenotype' in df.columns:
        continue

    # Fail with the offending file name rather than a bare KeyError('cell_type')
    if 'cell_type' not in df.columns:
        raise KeyError(f"{pred_file} has neither a 'true_phenotype' nor a 'cell_type' column")

    df['true_phenotype'] = df['cell_type']
    # overwrite the file in place, only now that it actually changed
    df.to_csv(pred_file, index=False)
    print(f"Updated {pred_file}")

In [None]:
## QC2 :
## Check for label harmonization errors for each dataset
## MANUAL STEPS: Rename if present, Add to cell_type_hierarchy.txt if needed

dataset = "IMMUcan" # check for datasets individually

# Every predictions_*.csv in a level3 folder below this dataset's results folder
prediction_files = glob.glob(f"../results/{dataset}/**/level3/predictions_*.csv", recursive=True)

# Load each file and stack them into a single frame
frames = []
for csv_path in prediction_files:
    frames.append(pd.read_csv(csv_path))
preds = pd.concat(frames, ignore_index=True)

# Distinct predicted labels, and the label -> ancestors mapping from the hierarchy file
unique_predictions = preds['predicted_phenotype'].unique()
ancestor_map = parse_tree_file("cell_type_hierarchy.txt")

# Every string the mapping mentions: its keys plus every entry of every ancestor path
all_strings_in_ancestor_map = set(ancestor_map).union(*ancestor_map.values())

# Split the predicted labels into those the mapping mentions and those it does not
unique_predictions_in_all_ancestor_strings = []
unique_predictions_not_in_all_ancestor_strings = []
for pred in unique_predictions:
    if pred in all_strings_in_ancestor_map:
        unique_predictions_in_all_ancestor_strings.append(pred)
    else:
        unique_predictions_not_in_all_ancestor_strings.append(pred)

print("Preds in ancestor map:", unique_predictions_in_all_ancestor_strings)

print("Preds not in ancestor map:", unique_predictions_not_in_all_ancestor_strings)

## Manual TODO: Add the labels printed as "not in ancestor map" to cell_type_hierarchy.txt
## Manual TODO: For labels that only differ in spelling from an existing entry, harmonize the label and save



In [None]:
# Harmonize known spelling variants of predicted_phenotype labels in every
# predictions_*.csv that sits in a level3 folder under ../results
prediction_files = glob.glob("../results/**/**/level3/predictions_*.csv", recursive=True)

# variant -> harmonized label. No harmonized label is itself a variant, so one
# dict-based replace gives the same result as applying the five replacements in turn.
label_fixes = {
    "Tcell": "T_cell",
    "Unknown": "undefined",
    "NKT": "NK_T_cell",
    "Bcells": "B_cell",
    "Stroma": "Stromal",
}

# Loop through each prediction file with progress bar
for pred_file in tqdm(prediction_files, desc="Processing prediction files"):
    df = pd.read_csv(pred_file)

    harmonized = df['predicted_phenotype'].replace(label_fixes)

    # Skip files without any variant label so they are not rewritten needlessly
    if harmonized.equals(df['predicted_phenotype']):
        continue

    df['predicted_phenotype'] = harmonized
    # overwrite the file in place, only when a label actually changed
    df.to_csv(pred_file, index=False)
    print(f"Updated {pred_file}")

In [5]:
## QC3:
# Create adapted cases for the pretrained methods, to be fairer to them.

####################

# Algo case B2:
# - get all cell types the method can predict
# - if the true cell type is not in this list, remove that row
# - evaluate

# Algo case B1:
# - add all cell types it can predict to the new cell_type_hierarchy
# - evaluate so we get better hier F1 ???

# Label list for the deepcelltypes method (looked up by name in the case B2 filter).
# Order is kept exactly as originally written.
deepcelltypes_cell_types = [
    "T_cell", "Treg", "CD4+_T_cell", "CD8+_T_cell", "NK_T_cell",
    "B_cell", "Plasma_cell", "NK_cell", "Dendritic_cell", "Mast",
    "Neutrophil", "Macrophage", "Microglial", "Langerhans", "Monocyte",
    "Epithelial", "Collecting_duct", "Melanocyte", "Goblet", "Paneth",
    "Enterocyte", "Endocrine", "Alpha_cell", "Beta_cell", "Endothelial",
    "HSCs", "LymphaticEndothelial", "BloodVesselEndothelial", "Stromal", "Fibroblast",
    "Stellate", "Myofibroblasts", "SmoothMuscle", "Pericyte", "Mesangial",
    "CardiacMuscle", "Nerve", "Neuronal", "Glial", "Cancer",
    "Thrombocyte", "Erythrocyte", "Hepatocyte", "Astrocyte", "EVT",
    "ICC", "Littoral_cell", "Podocyte", "M1_Macrophage", "M2_Macrophage",
]
# TODO: change EVT to Trophoblast

# Label list for the ribca method (looked up by name in the case B2 filter).
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into one string by Python.
ribca_cell_types = [
    "Treg", "CD4+_T_cell", "CD8+_T_cell", "NK_T_cell", "B_cell", "Plasma_cell", "NK_cell",
    "Dendritic_cell", "Mast", "Neutrophil", "Granulocyte",
    "Epithelial", "Endothelial",
    "Stromal", "Nerve", "Cancer", "M1_Macrophage", "M2_Macrophage",
]

In [11]:
datasets = ["IMMUcan", "cHL_2_MIBI"]
methods = ["ribca", "deepcelltypes"]

# Explicit method -> label-list lookup (replaces eval() on a constructed variable name)
method_cell_types = {
    "ribca": ribca_cell_types,
    "deepcelltypes": deepcelltypes_cell_types,
}

# Spelling variants in predicted_phenotype -> harmonized label.
# "Stroma" -> "Stromal" matches the label used in the method lists; mapping the other
# way would leave "Stroma", which can never be in true_list and is then binned to 'Other'.
predicted_label_fixes = {
    "B_cells": "B_cell",
    "Tcell": "T_cell",
    "Stroma": "Stromal",
}

for dataset in datasets:
    for method in methods:
        path = f"../results/{dataset}/{method}/level3"
        out_dir = f"../results/{dataset}/{method}_adapted/level3/"

        # loop through all predictions_*.csv in path (sorted for a reproducible order)
        for file in sorted(os.listdir(path)):
            if not (file.startswith("predictions_") and file.endswith(".csv")):
                continue

            preds = pd.read_csv(os.path.join(path, file))

            # Rename cell_type to true_phenotype only when true_phenotype is absent;
            # renaming when it already exists would create two columns with the same name
            if "true_phenotype" not in preds.columns:
                preds = preds.rename(columns={"cell_type": "true_phenotype"})

            # harmonize label variants ("B_cells" -> "B_cell", etc.)
            preds['true_phenotype'] = preds['true_phenotype'].replace("B_cells", "B_cell")
            preds['predicted_phenotype'] = preds['predicted_phenotype'].replace(predicted_label_fixes)

            # case B2: drop rows whose true label is not in this method's label list.
            # .copy() so the assignment below writes to an independent frame, not a slice.
            preds = preds[preds['true_phenotype'].isin(method_cell_types[method])].copy()

            # case A
            # build the list of labels that count as valid predictions
            true_list = preds['true_phenotype'].unique()
            # add extra phenotypes to true_list
            true_list = np.append(true_list, ['Macrophage']) # TODO: Add celltypes depending on dataset

            # bin every predicted label that is not in true_list into 'Other'
            preds['predicted_phenotype'] = preds['predicted_phenotype'].where(preds['predicted_phenotype'].isin(true_list), 'Other')

            # save into a separate "<method>_adapted" folder
            os.makedirs(out_dir, exist_ok=True)
            print(f"Saving adapted predictions for {dataset} - {method} to {out_dir}{file}")
            preds.to_csv(f"{out_dir}{file}", index=False)


Saving adapted predictions for IMMUcan - ribca to ../results/IMMUcan/ribca_adapted/level3/predictions_1.csv
Saving adapted predictions for IMMUcan - ribca to ../results/IMMUcan/ribca_adapted/level3/predictions_0.csv
Saving adapted predictions for IMMUcan - deepcelltypes to ../results/IMMUcan/deepcelltypes_adapted/level3/predictions_4.csv
Saving adapted predictions for IMMUcan - deepcelltypes to ../results/IMMUcan/deepcelltypes_adapted/level3/predictions_2.csv
Saving adapted predictions for IMMUcan - deepcelltypes to ../results/IMMUcan/deepcelltypes_adapted/level3/predictions_1.csv
Saving adapted predictions for IMMUcan - deepcelltypes to ../results/IMMUcan/deepcelltypes_adapted/level3/predictions_0.csv
Saving adapted predictions for IMMUcan - deepcelltypes to ../results/IMMUcan/deepcelltypes_adapted/level3/predictions_3.csv
Saving adapted predictions for cHL_2_MIBI - ribca to ../results/cHL_2_MIBI/ribca_adapted/level3/predictions_4.csv
Saving adapted predictions for cHL_2_MIBI - ribca 