In [1]:
# import libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, matthews_corrcoef, cohen_kappa_score, adjusted_rand_score, normalized_mutual_info_score

In [2]:
# calculate the cell type distribution in terms of percentages for predicted and true phenotypes
def calculate_cell_type_distribution(df):
    """
    Calculate the cell type distribution in terms of percentages for predicted and true phenotypes.

    Parameters:
        df (pd.DataFrame): DataFrame containing the predictions.

    Returns:
        pd.DataFrame: DataFrame containing the cell type distribution for predicted and true phenotypes.
    """
    # Get the counts and percentages for predicted phenotypes
    counts_predicted = df['predicted_phenotype'].value_counts()
    percentages_predicted = counts_predicted / counts_predicted.sum() * 100
    predicted_distribution = pd.DataFrame({'cell_type': counts_predicted.index, 'predicted_percentage': percentages_predicted.values})

    # Get the counts and percentages for true phenotypes
    counts_true = df['true_phenotype'].value_counts()
    percentages_true = counts_true / counts_true.sum() * 100
    true_distribution = pd.DataFrame({'cell_type': counts_true.index, 'true_percentage': percentages_true.values})

    # Merge the two distributions
    distribution_df = pd.merge(predicted_distribution, true_distribution, on='cell_type', how='outer').fillna(0)

    return distribution_df

# calcualte r2 and pearson correlation for the predicted and true phenotypes
def calculate_r2_and_pearson(df):
    """
    Calculate R2 and Pearson correlation for the predicted and true phenotypes.

    Parameters:
        df (pd.DataFrame): DataFrame containing the predictions.

    Returns:
        tuple: R2 and Pearson correlation values.
    """
    # Calculate R2
    r2 = df['predicted_percentage'].corr(df['true_percentage']) ** 2

    # Calculate Pearson correlation
    pearson_corr = df['predicted_percentage'].corr(df['true_percentage'])

    return r2, pearson_corr

# calculate metrics for multiple methods and levels
def calculate_metrics_for_methods(methods, base_path):
    """
    Calculate metrics for multiple methods and levels.

    Parameters:
        methods (list): List of methods to iterate over.
        levels (list): List of levels to iterate over.
        base_path (str): Base path to the results directory.

    Returns:
        pd.DataFrame: DataFrame containing all results.
    """
    all_results = []

    for method in methods:
        method_path = os.path.join(base_path, method)
        levels_in_method = [f for f in os.listdir(method_path) if os.path.isdir(os.path.join(method_path, f))]
        for level in levels_in_method:
            # Update the path for the current method and level
            path = os.path.join(base_path, method, level)
            
            # Get all the predictions* csv files from the path
            files = [f for f in os.listdir(path) if f.startswith("predictions") and f.endswith(".csv")]
            
            # Loop through the files and read them into dataframes
            for file in files:
                # Read the csv file into a dataframe
                df = pd.read_csv(os.path.join(path, file))
                # Get the name of the file without the extension
                fold_name = os.path.splitext(file)[0]
                
                # Calculate the F1 score metric
                f1 = f1_score(df['true_phenotype'], df['predicted_phenotype'], average='weighted')
                # Calculate the accuracy metric
                accuracy = (df['true_phenotype'] == df['predicted_phenotype']).mean()
                # Calculate the macro F1 score metric
                macro_f1 = f1_score(df['true_phenotype'], df['predicted_phenotype'], average='macro')
                # Calculate the MCC metric
                mcc = matthews_corrcoef(df['true_phenotype'], df['predicted_phenotype'])
                # Calculate the Cohen's kappa metric
                kappa = cohen_kappa_score(df['true_phenotype'], df['predicted_phenotype'])
                
                # calcualte the cell type composition metrics
                cell_type_distribution = calculate_cell_type_distribution(df)
                r2, pearson_corr = calculate_r2_and_pearson(cell_type_distribution)
                ari = adjusted_rand_score(df['true_phenotype'], df['predicted_phenotype'])
                nmi = normalized_mutual_info_score(df['true_phenotype'], df['predicted_phenotype'])


                # Append all metrics to the list
                all_results.append({
                    'method': method,
                    'level': level,
                    'fold': fold_name,
                    'f1_weighted': f1,
                    'accuracy': accuracy,
                    'macro_f1': macro_f1,
                    'mcc': mcc,
                    'kappa': kappa,
                    'r2': r2,
                    'pearson_corr': pearson_corr,
                    'ari': ari,
                    'nmi': nmi,
                })

    # Create a final dataframe from the results of all methods and levels
    return pd.DataFrame(all_results)
    

In [3]:
base_path = "../results/IMMUcan"

#make a list of folders in the base_path
methods = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]

# Calculate metrics for all methods and levels
results = calculate_metrics_for_methods(methods, base_path)
# Get the average of the metrics for each method and level, excluding the 'fold' column
average_results = results.drop(columns=["fold"]).groupby(['method', 'level'], as_index=False).mean()
# Get the standard deviation of the metrics for each method and level
std_results = results.drop(columns=["fold"]).groupby(['method', 'level']).std().reset_index()
# Merge the average and standard deviation results
final_results = pd.merge(average_results, std_results, on=['method', 'level'], suffixes=('_mean', '_std'))

# Rename the columns for clarity
final_results.columns = ['method', 'level', 'f1_weighted_mean', 'accuracy_mean', 'macro_f1_mean', 
                         'mcc_mean', 'kappa_mean', 'r2_mean', 'pearson_corr_mean', 'ari_mean', 'nmi_mean',
                          'f1_weighted_std', 'accuracy_std', 'macro_f1_std', 
                          'mcc_std', 'kappa_std', 'r2_std', 'pearson_corr_std', 'ari_std', 'nmi_std']

# calculate a stability metric for the methods where s = (1 - std/stability_thresh)
stability_thresh = 0.1
final_results['stability'] = 1 - (final_results['f1_weighted_std'] / stability_thresh)
# set to 0 if negative
final_results.loc[final_results['stability'] < 0, 'stability'] = 0

# Save the final results to a CSV file with ';' as the separator
final_results.to_csv(os.path.join(base_path, "final_results.csv"), index=False, sep=';')


In [4]:
final_results

Unnamed: 0,method,level,f1_weighted_mean,accuracy_mean,macro_f1_mean,mcc_mean,kappa_mean,r2_mean,pearson_corr_mean,ari_mean,...,f1_weighted_std,accuracy_std,macro_f1_std,mcc_std,kappa_std,r2_std,pearson_corr_std,ari_std,nmi_std,stability
0,celllens_full,level1,0.769918,0.787595,0.434116,0.654172,0.650998,0.984719,0.992304,0.533436,...,0.017035,0.012976,0.026782,0.020599,0.023170,0.016122,0.008137,0.018009,0.013936,0.829650
1,celllens_full,level2,0.712484,0.738562,0.428681,0.617943,0.611200,0.946736,0.972924,0.553777,...,0.015319,0.006264,0.011056,0.013372,0.014985,0.026935,0.013928,0.025672,0.026046,0.846814
2,celllens_full,level3,0.572912,0.629216,0.299731,0.498525,0.489738,0.926878,0.962652,0.566756,...,0.015879,0.008192,0.026092,0.016501,0.017503,0.028717,0.014956,0.016632,0.025716,0.841205
3,flowsom,level1,0.852141,0.859788,0.691283,0.772601,0.771810,0.996667,0.998331,0.682376,...,0.006584,0.002249,0.033743,0.003562,0.003840,0.002917,0.001462,0.006339,0.005159,0.934159
4,flowsom,level2,0.820554,0.824977,0.694329,0.745247,0.744558,0.993402,0.996695,0.682130,...,0.005602,0.002859,0.018134,0.004535,0.004676,0.001896,0.000951,0.006001,0.005587,0.943981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,tribus,level2,0.718980,0.716802,0.539646,0.603690,0.596876,0.858396,0.926153,0.509417,...,0.009086,0.011575,0.009419,0.014543,0.015049,0.051805,0.028198,0.018449,0.016810,0.909138
59,tribus,level3,0.627592,0.591880,0.408982,0.486988,0.479464,0.878768,0.937326,0.512076,...,0.028453,0.025541,0.048844,0.038178,0.037236,0.028664,0.015294,0.020782,0.033737,0.715468
60,xgboost_default_StratifiedGroupKFold,level1,0.951263,0.951099,0.910843,0.921970,0.921758,0.997991,0.998994,0.873384,...,0.003902,0.003934,0.005540,0.005918,0.006177,0.002940,0.001473,0.011706,0.010494,0.960979
61,xgboost_default_StratifiedGroupKFold,level2,0.943194,0.942884,0.909173,0.918092,0.917848,0.998344,0.999171,0.876479,...,0.002252,0.002388,0.001949,0.002772,0.002852,0.000755,0.000378,0.006347,0.005257,0.977477


In [5]:
final_results['overall_score'] = (final_results['f1_weighted_mean'] + final_results['accuracy_mean'] + 
                                   final_results['macro_f1_mean'] + final_results['mcc_mean'] + 
                                   final_results['kappa_mean'] + final_results['r2_mean'] + 
                                   final_results['pearson_corr_mean'] + final_results['ari_mean'] + 
                                   final_results['nmi_mean'] + final_results['stability']) / 10

In [6]:
final_results = final_results.sort_values(by='overall_score', ascending=False)
final_results

Unnamed: 0,method,level,f1_weighted_mean,accuracy_mean,macro_f1_mean,mcc_mean,kappa_mean,r2_mean,pearson_corr_mean,ari_mean,...,accuracy_std,macro_f1_std,mcc_std,kappa_std,r2_std,pearson_corr_std,ari_std,nmi_std,stability,overall_score
60,xgboost_default_StratifiedGroupKFold,level1,0.951263,0.951099,0.910843,0.921970,0.921758,0.997991,0.998994,0.873384,...,0.003934,0.005540,0.005918,0.006177,0.002940,0.001473,0.011706,0.010494,0.960979,0.928084
61,xgboost_default_StratifiedGroupKFold,level2,0.943194,0.942884,0.909173,0.918092,0.917848,0.998344,0.999171,0.876479,...,0.002388,0.001949,0.002772,0.002852,0.000755,0.000378,0.006347,0.005257,0.977477,0.928012
62,xgboost_default_StratifiedGroupKFold,level3,0.925801,0.924738,0.883379,0.901992,0.901470,0.999103,0.999552,0.873386,...,0.003794,0.005645,0.005070,0.005228,0.000287,0.000144,0.010562,0.006439,0.963755,0.917621
47,random_forest_default_StratifiedGroupKFold,level2,0.926403,0.927633,0.883129,0.895041,0.894812,0.998078,0.999038,0.844209,...,0.002263,0.002480,0.003204,0.003285,0.001453,0.000727,0.006338,0.006045,0.975685,0.909740
48,random_forest_default_StratifiedGroupKFold,level3,0.915094,0.916437,0.859490,0.889146,0.888939,0.998561,0.999280,0.857571,...,0.004398,0.005829,0.005101,0.005069,0.000209,0.000104,0.011926,0.006960,0.960451,0.906880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,most_frequent_default_StratifiedGroupKFold,level3,0.291131,0.461164,0.045086,0.000000,0.000000,0.935639,0.967282,0.000000,...,0.007799,0.000524,0.000000,0.000000,0.004522,0.002338,0.000000,0.000000,0.917417,0.361772
55,stratified_default_StratifiedGroupKFold,level3,0.244980,0.244983,0.070295,0.000133,0.000132,0.990173,0.995072,0.000203,...,0.009978,0.000457,0.000887,0.000885,0.004791,0.002408,0.000995,0.000022,0.980526,0.352664
26,most_frequent_default_StratifiedGroupKFold,level2,0.291479,0.461374,0.126254,0.000000,0.000000,0.782001,0.883953,0.000000,...,0.017438,0.003267,0.000000,0.000000,0.049681,0.028019,0.000000,0.000000,0.814549,0.335961
49,ribca,level3,0.322905,0.323392,0.153901,0.271541,0.245158,0.171240,0.413812,0.095452,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.319883


In [7]:
# take only level3 reults
final_results_level3 = final_results[final_results['level'] == 'level3'].reset_index(drop=True)
final_results_level3

Unnamed: 0,method,level,f1_weighted_mean,accuracy_mean,macro_f1_mean,mcc_mean,kappa_mean,r2_mean,pearson_corr_mean,ari_mean,...,accuracy_std,macro_f1_std,mcc_std,kappa_std,r2_std,pearson_corr_std,ari_std,nmi_std,stability,overall_score
0,xgboost_default_StratifiedGroupKFold,level3,0.925801,0.924738,0.883379,0.901992,0.90147,0.999103,0.999552,0.873386,...,0.003794,0.005645,0.00507,0.005228,0.000287,0.000144,0.010562,0.006439,0.963755,0.917621
1,random_forest_default_StratifiedGroupKFold,level3,0.915094,0.916437,0.85949,0.889146,0.888939,0.998561,0.99928,0.857571,...,0.004398,0.005829,0.005101,0.005069,0.000209,0.000104,0.011926,0.00696,0.960451,0.90688
2,maps,level3,0.903755,0.901276,0.847469,0.872484,0.871502,0.998294,0.999147,0.839596,...,0.005303,0.004744,0.005682,0.005957,0.000561,0.000281,0.014904,0.006992,0.945783,0.894065
3,logistic_regression_default_StratifiedGroupKFold,level3,0.839154,0.827922,0.746408,0.783474,0.780206,0.992853,0.996419,0.735522,...,0.018005,0.024395,0.019031,0.020216,0.003463,0.001738,0.038141,0.020571,0.846772,0.819546
4,scyan,level3,0.74946,0.749325,0.587866,0.676482,0.673831,0.971875,0.985816,0.696248,...,0.015039,0.010446,0.013394,0.014832,0.013773,0.007017,0.02405,0.00445,0.936944,0.757059
5,flowsom,level3,0.743259,0.758169,0.538635,0.674506,0.673215,0.993109,0.996547,0.708362,...,0.004397,0.015023,0.006314,0.00605,0.003714,0.001866,0.005413,0.006076,0.943666,0.754401
6,leiden_res2,level3,0.628372,0.665426,0.419831,0.538497,0.532509,0.982349,0.991134,0.564658,...,0.005516,0.030555,0.007083,0.006776,0.002864,0.001445,0.007831,0.005949,0.922168,0.662634
7,tacit,level3,0.656133,0.612613,0.475797,0.528962,0.519499,0.884568,0.940515,0.507747,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.656863
8,phenograph_40,level3,0.614089,0.660184,0.391597,0.529717,0.522106,0.977293,0.988578,0.549899,...,0.007198,0.016487,0.009574,0.009525,0.005046,0.002554,0.006582,0.00853,0.897178,0.651556
9,phenograph_20,level3,0.605839,0.648504,0.36493,0.515124,0.509362,0.98029,0.990095,0.542844,...,0.008962,0.028197,0.011772,0.01169,0.002245,0.001135,0.009124,0.009172,0.905893,0.642574


In [23]:
grouped_results = final_results.groupby('level').apply(lambda x: x.sort_values(by='overall_score', ascending=False)).reset_index(drop=True)
grouped_results

  grouped_results = final_results.groupby('level').apply(lambda x: x.sort_values(by='overall_score', ascending=False)).reset_index(drop=True)


Unnamed: 0,method,level,f1_weighted_mean,accuracy_mean,macro_f1_mean,mcc_mean,kappa_mean,r2_mean,pearson_corr_mean,ari_mean,...,accuracy_std,macro_f1_std,mcc_std,kappa_std,r2_std,pearson_corr_std,ari_std,nmi_std,stability,overall_score
0,xgboost_default_StratifiedGroupKFold,level1,0.959455,0.955599,0.84094,0.812676,0.804535,0.997234,0.998613,0.770009,...,0.026274,0.101135,0.043406,0.051643,0.005715,0.002867,0.06407,0.046734,0.742051,0.854905
1,random_forest_default_StratifiedGroupKFold,level1,0.940625,0.945989,0.741748,0.71889,0.697636,0.997294,0.998643,0.66303,...,0.037426,0.097714,0.056373,0.069886,0.005051,0.002533,0.084972,0.070196,0.569466,0.782415
2,leiden_res2,level1,0.881192,0.902043,0.555842,0.472101,0.437085,0.999178,0.999589,0.404206,...,0.003889,0.026968,0.022489,0.030821,0.000432,0.000216,0.031114,0.015605,0.942528,0.688866
3,phenograph_30,level1,0.880967,0.903861,0.542004,0.476354,0.432703,0.999185,0.999592,0.399048,...,0.000458,0.016666,0.013881,0.036413,0.000548,0.000274,0.03497,0.009059,0.952745,0.688362
4,phenograph_40,level1,0.881019,0.90314,0.542333,0.476564,0.43698,0.999161,0.99958,0.402862,...,0.002033,0.02418,0.032452,0.056932,0.000462,0.000231,0.054569,0.024687,0.927185,0.686543
5,phenograph_20,level1,0.879243,0.903692,0.539546,0.470175,0.418327,0.998907,0.999454,0.384506,...,0.00135,0.021572,0.02287,0.045387,0.00055,0.000275,0.042555,0.016875,0.938938,0.682622
6,phenograph_80,level1,0.875409,0.903369,0.518573,0.457186,0.384473,0.998685,0.999342,0.352548,...,0.000721,0.010755,0.009673,0.023716,0.000351,0.000175,0.022497,0.006399,0.967184,0.673743
7,leiden_res1,level1,0.870579,0.899762,0.509973,0.438002,0.371998,0.998395,0.999197,0.34809,...,0.002905,0.031116,0.028853,0.059974,0.00088,0.00044,0.058259,0.017459,0.909476,0.661606
8,leiden_res0_8,level1,0.868387,0.898859,0.504725,0.427545,0.356667,0.998243,0.999121,0.328118,...,0.001687,0.030989,0.030063,0.061321,0.000825,0.000413,0.058062,0.018999,0.911341,0.655672
9,leiden_res0_5,level1,0.85739,0.89631,0.466193,0.39292,0.286865,0.997337,0.998667,0.263389,...,0.002151,0.014461,0.01967,0.02534,0.000303,0.000152,0.022731,0.012649,0.949965,0.634998


In [24]:
# Save the final results to a CSV file with ';' as the separator
grouped_results.to_csv(os.path.join(base_path, "final_results.csv"), index=False, sep=';')