In [44]:
import pandas as pd
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from scipy.stats import entropy
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score, silhouette_score
import scanpy as sc

In [46]:
DATASET_DIR = "../dataset"
DATASET_NAMES = ["PBMC1", "PBMC2", "PBMC3", "PBMC4"]
TOOLS = ['COTAN', 'monocle', 'scanpy', 'scvi-tools', 'seurat']
PARAMS_TUNING = ['default', 'celltypist', 'protein']

In [55]:
for tuning in PARAMS_TUNING:
    for dataset in DATASET_NAMES:
        print("------------------------------")
        print(dataset)
        # concat tools labels
        labels_df = pd.read_csv(f'../results/{dataset}/{TOOLS[0]}/{tuning}/clustering_labels.csv', index_col=0)
        labels_df.rename(columns={"cluster": "cluster_COTAN"}, inplace=True)
        for tool in TOOLS[1:]:
            tool_labels_df = pd.read_csv(f'../results/{dataset}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
            labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
            labels_df.rename(columns={"cluster": f"cluster_{tool}"}, inplace=True)
        # concat celltypist labels
        celltypist_df = pd.read_csv(f'../dataset/{dataset}-Filtered/raw/celltypist_labels.csv', index_col=0)
        labels_df = labels_df.merge(celltypist_df, how='inner', on='cell')
        labels_df.rename(columns={"cluster.ids": f"cluster_celltypist"}, inplace=True)
        # concat protein surface labels
        protein_df = pd.read_csv(f'../dataset/{dataset}-Filtered/10x/labels.csv', index_col=0)
        labels_df = labels_df.merge(protein_df, how='inner', on='cell')
        labels_df.rename(columns={"cluster.ids": f"cluster_protein"}, inplace=True)
        display(labels_df)

        # read dataset
        adata = sc.read_10x_mtx(
            f'../dataset/{dataset}-Filtered/10X/',
            var_names='gene_symbols',
            cache=False
        )
        # keep only cells labeled
        adata.var_names_make_unique()
        subset_cells = adata.obs_names.isin(labels_df.index)
        adata = adata[subset_cells, :]

        # compute silhouette score
        silhouette = {}
        for tool in TOOLS:
            silhouette[tool] = silhouette_score(adata.X, labels_df[f'cluster_{tool}'])
        silhouette['celltypist'] = silhouette_score(adata.X, labels_df[f'cluster_celltypist'])
        silhouette['protein'] = silhouette_score(adata.X, labels_df[f'cluster_protein'])
        silhouette_df = pd.DataFrame(silhouette, index=[0])
        display(silhouette_df)
        silhouette_df.to_csv(f'../results/{dataset}/silhouette.csv')
        silhouette_df.to_latex(f'../results/{dataset}/silhouette.tex')

        # compute scores comparing each tool labels with celltypist labels
        if tuning == 'celltypist' or tuning == 'deafault':
            scores_celltypist = {}
            scores_celltypist['NMI_celltypist'] = {}
            scores_celltypist['ARI_celltypist'] = {}
            scores_celltypist['homogeneity_celltypist'] = {}
            scores_celltypist['completeness_celltypist'] = {}
            scores_celltypist['v_measure_celltypist'] = {}
            scores_celltypist['fowlkes_mallows_celltypist'] = {}
            for tool in TOOLS:
                scores_celltypist['NMI_celltypist'][tool] = normalized_mutual_info_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'], average_method='arithmetic')
                scores_celltypist['ARI_celltypist'][tool] = adjusted_rand_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'])
                scores_celltypist['homogeneity_celltypist'][tool] = homogeneity_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'])
                scores_celltypist['completeness_celltypist'][tool] = completeness_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'])
                scores_celltypist['v_measure_celltypist'][tool] = v_measure_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'])
                scores_celltypist['fowlkes_mallows_celltypist'][tool] = fowlkes_mallows_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_celltypist'])
            scores_celltypist_df = pd.DataFrame(scores_celltypist)
            scores_celltypist_df.to_csv(f'../results/{dataset}/{tuning}_scores_celltypist.csv')
            scores_celltypist_df.to_latex(f'../results/{dataset}/{tuning}_scores_celltypist.tex')
            display(scores_celltypist_df)

        # compute scores comparing each tool labels with protein labels
        if tuning == 'protein' or tuning == 'deafault':
            scores_protein = {}
            scores_protein['NMI_protein'] = {}
            scores_protein['ARI_protein'] = {}
            scores_protein['homogeneity_protein'] = {}
            scores_protein['completeness_protein'] = {}
            scores_protein['v_measure_protein'] = {}
            scores_protein['fowlkes_mallows_protein'] = {}
            for tool in TOOLS:
                scores_protein['NMI_protein'][tool] = normalized_mutual_info_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'], average_method='arithmetic')
                scores_protein['ARI_protein'][tool] = adjusted_rand_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'])
                scores_protein['homogeneity_protein'][tool] = homogeneity_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'])
                scores_protein['completeness_protein'][tool] = completeness_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'])
                scores_protein['v_measure_protein'][tool] = v_measure_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'])
                scores_protein['fowlkes_mallows_protein'][tool] = fowlkes_mallows_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df['cluster_protein'])
            scores_protein_df = pd.DataFrame(scores_protein)
            scores_protein_df.to_csv(f'../results/{dataset}/{tuning}_scores_protein.csv')
            scores_protein_df.to_latex(f'../results/{dataset}/{tuning}_scores_protein.tex')
            display(scores_protein_df)


Unnamed: 0_level_0,cluster_COTAN,cluster_monocle,cluster_scanpy,cluster_scvi-tools,cluster_seurat,cluster_celltypist,cluster_protein
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCCAGTAGTTAGA,6,2,6,6,4,1,7
AAACGAAGTAACGATA,6,2,6,6,4,1,7
AAACGAAGTGGATCAG,4,2,4,4,3,1,7
AAACGAATCATGAGAA,10,1,10,10,1,2,2
AAACGCTAGGATAATC,10,1,10,10,1,2,3
...,...,...,...,...,...,...,...
TTTGGAGAGGTAGCCA,10,1,10,10,1,2,2
TTTGGAGGTATCGATC,13,1,13,13,2,5,3
TTTGGTTCAATTTCCT,5,2,5,5,3,1,7
TTTGGTTGTTGGAGAC,13,1,13,13,2,5,1


  if not is_categorical_dtype(df_full[k]):


{'COTAN': 0.000697303,
 'monocle': 0.16298433,
 'scanpy': 0.000697303,
 'scvi-tools': 0.000697303,
 'seurat': 0.026355628,
 'celltypist': 0.020001175,
 'protein': 0.033428274}

FileNotFoundError: [Errno 2] No such file or directory: '../results/PBMC1/COTAN/celltypist/clustering_labels.csv'