In [1]:
# import the following libraries
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
from scipy.stats import kruskal, mannwhitneyu
from itertools import combinations

# i have 3 types of methods
manual_stuff = {"FREEHAND", "OVAL"}
kinda_auto = {"SEMI", "SAMJ", "ILASTIK", "60%"}
full_auto = {"PORED2", "UNET", "OTSU", "PHANSALKAR"}

# pretty colors for plots
blue = "#9ecae1" 
purple = "#d0b7ff"
red = "#f7b6b6"
colors = [blue, purple, red]

import os
from pathlib import Path
notebook_dir = Path.cwd()
proj_root = notebook_dir.parent
org_dir = str(proj_root)

# these are all my methods
methods = ['60%', 'FREEHAND', 'ILASTIK', 'OTSU', 'OVAL', 'PHANSALKAR', 'PORED2', 'SAMJ', 'SEMI', 'UNET']

# my folders with data (copied from file explorer)
folders = [
    os.path.join(org_dir ,"AFM Accuracy INTERNAL/AFM 1% INTERNAL"),
    os.path.join(org_dir ,"AFM Accuracy INTERNAL/AFM 1.5% INTERNAL"), 
    os.path.join(org_dir ,"AFM Accuracy INTERNAL/AFM 2% INTERNAL"),
    os.path.join(org_dir ,"Confocal Accuracy INTERNAL/Confocal 0.375%"),
    os.path.join(org_dir ,"Confocal Accuracy INTERNAL/Confocal 1%"),
    os.path.join(org_dir ,"STED Accuracy INTERNAL/Internal 0.375%"),
    os.path.join(org_dir , "STED Accuracy INTERNAL/Internal 1%"),
    os.path.join(org_dir , "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X3000/CRYO-SEM X3000"),
    os.path.join(org_dir , "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X10000"),
    os.path.join(org_dir , "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X30000"),
    os.path.join(org_dir , "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X60000"),
]

# check if its a metric file
def is_metric_file(fname):
    # my files look like "Metric Results [METHOD].csv"
    return fname.startswith("Metric Results [") and fname.endswith("].csv")

# see if folder has replicate numbers like [1] [2] etc
def has_rep_num(path):
    for part in path.parts:
        if "[1]" in part or "[2]" in part or "[3]" in part or "[4]" in part:
            return True
    return False

# afm 1.5% is weird and doesnt have rep numbers
def is_afm_weird_case(path):
    path_str = str(path).upper()
    if "AFM" in path_str and "1.5%" in path_str and "ACCURACY" in path.name.upper():
        return True
    return False

# confocal and sted sometimes dont have reps either
def is_single_ok(path):
    name = path.name.upper()
    if ("CONFOCAL" in name or "STED" in name) and name.endswith("ACCURACY"):
        return True
    return False

# figure out what kind of data this is
def get_info_from_path(path):
    parts = str(path).upper()
    
    # what technique
    if "AFM" in parts:
        tech = "AFM"
        if "1.5%" in parts:
            conc = "1.5%"
            col = "mean"
        elif "1%" in parts and "1.5%" not in parts:
            conc = "1%"  
            col = "value"
        elif "2%" in parts:
            conc = "2%"
            col = "value"
        else:
            conc = "unknown"
            col = "mean"
            
    elif "STED" in parts:
        tech = "STED"
        col = "mean"
        if "0.375%" in parts:
            conc = "0.375%"
        elif "1%" in parts:
            conc = "1%"
        else:
            conc = "unknown"
            
    elif "CONFOCAL" in parts:
        tech = "CONFOCAL" 
        col = "mean"
        if "0.375%" in parts:
            conc = "0.375%"
        elif "1%" in parts:
            conc = "1%"
        else:
            conc = "unknown"
            
    elif "CRYO" in parts and "SEM" in parts:
        tech = "CRYO-SEM"
        col = "value"
        if "X60000" in parts:
            conc = "X60000"
        elif "X30000" in parts:
            conc = "X30000" 
        elif "X10000" in parts:
            conc = "X10000"
        elif "X3000" in parts:
            conc = "X3000"
        else:
            conc = "unknown"
    else:
        tech = "unknown"
        conc = "unknown"
        col = "mean"
        
    return tech, conc, col

# get replicate number from path
def get_rep_num(path):
    path_str = str(path)
    # look for [1], [2] etc
    if "[1]" in path_str:
        return 1
    elif "[2]" in path_str:
        return 2  
    elif "[3]" in path_str:
        return 3
    elif "[4]" in path_str:
        return 4
    else:
        return 1  # default

# find all folders with data
all_dirs = []
for folder in folders:
    if not os.path.exists(folder):
        print("cant find: " + folder)
        continue
        
    # look through subfolders 
    for root, dirs, files in os.walk(folder):
        for f in files:
            if is_metric_file(f):
                all_dirs.append(Path(root))
                break

# only keep good folders
good_dirs = []
for d in all_dirs:
    if has_rep_num(d) or is_afm_weird_case(d) or is_single_ok(d):
        good_dirs.append(d)

# remove duplicate 'Accuracy' folders if there are specific ones
final_dirs = []
parent_groups = {}
for d in good_dirs:
    parent = d.parent
    if parent not in parent_groups:
        parent_groups[parent] = []
    parent_groups[parent].append(d)

for parent, dir_list in parent_groups.items():
    if len(dir_list) > 1:
        specific = [d for d in dir_list if d.name.lower() != "accuracy"]
        if specific:
            final_dirs.extend(specific)
        else:
            final_dirs.extend(dir_list)
    else:
        final_dirs.extend(dir_list)

dataset_folders = sorted(set(final_dirs))
print("found " + str(len(dataset_folders)) + " folders")

# read all the data
all_data = []

for folder in dataset_folders:
    tech, conc, col = get_info_from_path(folder)
    rep = get_rep_num(folder)
    
    print("doing: " + str(folder))
    
    # check each method
    for method in methods:
        file_path = folder / ("Metric Results [" + method + "].csv")
        if not file_path.exists():
            continue
            
        try:
            df = pd.read_csv(file_path)
            if 'metric' not in df.columns:
                continue
                
            # find different metrics
            acc_data = df[df['metric'] == 'accuracy']
            prec_data = df[df['metric'] == 'precision'] 
            rec_data = df[df['metric'] == 'recall']
            f1_data = df[df['metric'] == 'f1_dice']
            iou_data = df[df['metric'] == 'iou_jaccard']
            spec_data = df[df['metric'] == 'specificity']
            bal_data = df[df['metric'] == 'balanced_accuracy']
            
            if len(acc_data) > 0:
                acc_val = acc_data.iloc[0].get(col, None)
                
                row = {}
                row['technique'] = tech
                row['concentration'] = conc  
                row['replicate'] = rep
                row['method'] = method
                
                # accuracy 
                if acc_val is not None and not pd.isna(acc_val):
                    row['accuracy'] = str(round(float(acc_val), 2))
                else:
                    row['accuracy'] = None
                    
                # confidence intervals
                if 'ci95_low' in acc_data.columns:
                    row['accuracy_ci95_low'] = acc_data.iloc[0]['ci95_low']
                else:
                    row['accuracy_ci95_low'] = None
                if 'ci95_high' in acc_data.columns:
                    row['accuracy_ci95_high'] = acc_data.iloc[0]['ci95_high'] 
                else:
                    row['accuracy_ci95_high'] = None
                    
                # precision
                if len(prec_data) > 0 and col in prec_data.columns:
                    row['precision'] = prec_data.iloc[0][col]
                    if 'ci95_low' in prec_data.columns:
                        row['precision_ci95_low'] = prec_data.iloc[0]['ci95_low']
                    else:
                        row['precision_ci95_low'] = None
                    if 'ci95_high' in prec_data.columns:
                        row['precision_ci95_high'] = prec_data.iloc[0]['ci95_high']
                    else:
                        row['precision_ci95_high'] = None
                else:
                    row['precision'] = None
                    row['precision_ci95_low'] = None
                    row['precision_ci95_high'] = None
                    
                # recall
                if len(rec_data) > 0 and col in rec_data.columns:
                    row['recall'] = rec_data.iloc[0][col]
                    if 'ci95_low' in rec_data.columns:
                        row['recall_ci95_low'] = rec_data.iloc[0]['ci95_low']
                    else:
                        row['recall_ci95_low'] = None
                    if 'ci95_high' in rec_data.columns:
                        row['recall_ci95_high'] = rec_data.iloc[0]['ci95_high']
                    else:
                        row['recall_ci95_high'] = None
                else:
                    row['recall'] = None
                    row['recall_ci95_low'] = None  
                    row['recall_ci95_high'] = None
                    
                # other stuff
                if len(spec_data) > 0 and col in spec_data.columns:
                    row['specificity'] = spec_data.iloc[0][col]
                else:
                    row['specificity'] = None
                    
                if len(bal_data) > 0 and col in bal_data.columns:
                    row['balanced_accuracy'] = bal_data.iloc[0][col]
                else:
                    row['balanced_accuracy'] = None
                    
                # f1 score
                if len(f1_data) > 0 and col in f1_data.columns:
                    row['f1_dice'] = f1_data.iloc[0][col]
                    if 'ci95_low' in f1_data.columns:
                        row['f1_dice_ci95_low'] = f1_data.iloc[0]['ci95_low']
                    else:
                        row['f1_dice_ci95_low'] = None
                    if 'ci95_high' in f1_data.columns:
                        row['f1_dice_ci95_high'] = f1_data.iloc[0]['ci95_high']
                    else:
                        row['f1_dice_ci95_high'] = None
                else:
                    row['f1_dice'] = None
                    row['f1_dice_ci95_low'] = None
                    row['f1_dice_ci95_high'] = None
                    
                # iou 
                if len(iou_data) > 0 and col in iou_data.columns:
                    row['iou_jaccard'] = iou_data.iloc[0][col]
                    if 'ci95_low' in iou_data.columns:
                        row['iou_jaccard_ci95_low'] = iou_data.iloc[0]['ci95_low']
                    else:
                        row['iou_jaccard_ci95_low'] = None
                    if 'ci95_high' in iou_data.columns:
                        row['iou_jaccard_ci95_high'] = iou_data.iloc[0]['ci95_high']
                    else:
                        row['iou_jaccard_ci95_high'] = None
                else:
                    row['iou_jaccard'] = None
                    row['iou_jaccard_ci95_low'] = None
                    row['iou_jaccard_ci95_high'] = None
                    
                all_data.append(row)
                
        except Exception as e:
            print("error with " + str(file_path) + ": " + str(e))

# make dataframe
results = pd.DataFrame(all_data)
print("results shape: " + str(results.shape))

# show some data
cols = ['technique','concentration','replicate','method','accuracy','f1_dice','iou_jaccard','precision','recall','specificity','balanced_accuracy']
print(results[cols].head(12))

cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\STED Accuracy INTERNAL/Internal 0.375%
cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\STED Accuracy INTERNAL/Internal 1%
cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\CRYO-SEM Accuracy INTERNAL/CRYO-SEM X3000/CRYO-SEM X3000
cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\CRYO-SEM Accuracy INTERNAL/CRYO-SEM X10000
cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\CRYO-SEM Accuracy INTERNAL/CRYO-SEM X30000
cant find: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\CRYO-SEM Accuracy INTERNAL/CRYO-SEM X60000
found 10 folders
doing: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\AFM Accuracy INTERNAL\AFM 1% INTERNAL\AFM 1% [1]\AFM 1% [1] Accuracy
doing: c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\AFM Accuracy INTERNAL\AFM 1% I

In [2]:
# fix the number columns so they work properly
number_columns = ['accuracy', 'precision', 'recall', 'specificity', 'iou_jaccard']
for column in number_columns:
    results[column] = pd.to_numeric(results[column], errors='coerce')

# get rid of the ci95 columns since theyre all empty anyway
columns_to_remove = []
for column_name in results.columns:
    if 'ci95' in column_name.lower():
        columns_to_remove.append(column_name)

results = results.drop(columns=columns_to_remove)

# make pandas show more stuff when i print
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200) 
pd.set_option('display.width', 200)
pd.set_option('display.colheader_justify', 'center')

# the columns i want to look at
important_cols = ['technique','concentration','replicate','method','accuracy','precision','recall','specificity','balanced_accuracy','f1_dice','iou_jaccard']

# show me the first bunch of rows
results[important_cols].head(50)

Unnamed: 0,technique,concentration,replicate,method,accuracy,precision,recall,specificity,balanced_accuracy,f1_dice,iou_jaccard
0,AFM,1%,1,60%,0.61,0.354316,0.996642,0.506276,0.751459,0.522779,0.353894
1,AFM,1%,1,FREEHAND,0.93,0.821292,0.839553,0.95034,0.894946,0.830322,0.709872
2,AFM,1%,1,ILASTIK,0.92,0.891885,0.687589,0.977342,0.832465,0.776524,0.634687
3,AFM,1%,1,OTSU,0.7,0.410378,0.888336,0.653038,0.770687,0.561407,0.390248
4,AFM,1%,1,OVAL,0.92,0.821159,0.776845,0.954007,0.865426,0.798387,0.66443
5,AFM,1%,1,PORED2,0.8,0.532751,0.342839,0.918261,0.63055,0.4172,0.263583
6,AFM,1%,1,SAMJ,0.85,0.834425,0.361541,0.980498,0.67102,0.504494,0.33734
7,AFM,1%,1,SEMI,0.91,0.755655,0.842332,0.925958,0.884145,0.796643,0.662017
8,AFM,1%,1,UNET,0.73,0.366711,0.365044,0.828628,0.596836,0.365876,0.223897
9,AFM,1%,2,60%,0.63,0.395023,0.962829,0.515877,0.739353,0.560208,0.38909


In [3]:
# just check what data i got
def check_my_data(data):
    print("checking data:")
    print("total rows: " + str(len(data)))
    print("how many replicates: " + str(data['replicate'].nunique()))
    print("how many methods: " + str(data['method'].nunique()))
    
    # count methods
    counts = data.groupby('method').size()
    print("counts per method:")
    print(counts)

# metrics i care about
metrics_list = ['accuracy', 'precision', 'recall', 'specificity', 'iou_jaccard']

# kruskal wallis test function
def kruskal_test(data, metric):
    # get groups for each method
    method_groups = []
    for method_name in data['method'].unique():
        method_data = data[data['method'] == method_name]
        values = method_data[metric].dropna().values
        if len(values) > 0:
            method_groups.append(values)
    
    # need at least 2 groups
    if len(method_groups) < 2:
        return None, None
    
    # check if all groups are identical
    same_values = True
    for group in method_groups:
        if group.std() != 0:
            same_values = False
    
    if same_values:
        print("  warning: all groups have same values")
        return None, None
    
    # do the test
    h_value, p_value = kruskal(*method_groups)
    return h_value, p_value

# holm correction for multiple testing
def holm_fix(p_values):
    # make array
    p_arr = np.array(p_values)
    corrected = np.full(len(p_arr), np.nan)
    
    # find valid p values
    valid_idx = []
    for i in range(len(p_arr)):
        if not np.isnan(p_arr[i]):
            valid_idx.append(i)
    
    if len(valid_idx) == 0:
        return corrected
    
    # sort by p value
    sorted_idx = []
    for idx in valid_idx:
        sorted_idx.append((p_arr[idx], idx))
    sorted_idx.sort()
    
    # apply holm correction
    max_so_far = 0.0
    for rank, (p_val, orig_idx) in enumerate(sorted_idx):
        m = len(valid_idx)
        adjusted = min(1.0, (m - rank) * p_val)
        max_so_far = max(max_so_far, adjusted)
        corrected[orig_idx] = max_so_far
    
    return corrected

# pairwise mann whitney tests
def pairwise_tests(data, metric):
    methods = data['method'].unique()
    results = []
    
    # compare all pairs
    for i in range(len(methods)):
        for j in range(i+1, len(methods)):
            method1 = methods[i]
            method2 = methods[j]
            
            # get data for each method
            data1 = data[data['method'] == method1][metric].dropna().values
            data2 = data[data['method'] == method2][metric].dropna().values
            
            if len(data1) > 0 and len(data2) > 0:
                try:
                    u_stat, p_val = mannwhitneyu(data1, data2, alternative='two-sided')
                    
                    result = {}
                    result['method1'] = method1
                    result['method2'] = method2
                    result['n1'] = len(data1)
                    result['n2'] = len(data2)
                    result['median1'] = np.median(data1)
                    result['median2'] = np.median(data2)
                    result['u_statistic'] = u_stat
                    result['p_value'] = float(p_val)
                    
                    results.append(result)
                except:
                    print("  error comparing " + method1 + " and " + method2)
    
    # make dataframe
    results_df = pd.DataFrame(results)
    if len(results_df) > 0:
        # apply holm correction
        p_vals = results_df['p_value'].values
        corrected_p = holm_fix(p_vals)
        results_df['p_holm'] = corrected_p
        
        # check significance
        results_df['sig_raw'] = results_df['p_value'].apply(lambda x: "Yes" if x < 0.05 else "No")
        
        sig_holm = []
        for p in results_df['p_holm']:
            if pd.notna(p) and p < 0.05:
                sig_holm.append("Yes")
            else:
                sig_holm.append("No")
        results_df['sig_holm'] = sig_holm
    
    return results_df

# calculate summary statistics
def summary_stats(data, metric):
    summary = data.groupby('method')[metric].agg(['count', 'mean', 'median', 'std', 'min', 'max']).round(4)
    return summary

# conditions to test
test_conditions = [
    ('CRYO-SEM', 'X3000'),
    ('CRYO-SEM', 'X10000'),
    ('CRYO-SEM', 'X30000'),
    ('CRYO-SEM', 'X60000'),
    ('AFM', '1%'),
    ('AFM', '2%'),
    ('AFM', '1.5%'),
    ('STED', '0.375%'),
    ('STED', '1%'),
    ('CONFOCAL', '0.375%'),
    ('CONFOCAL', '1%'),
]

# go through each condition
for technique, concentration in test_conditions:
    # filter data
    filtered_data = results[(results['technique'] == technique) & (results['concentration'] == concentration)]
    
    print("")
    print("=" * 80)
    print(technique + " " + concentration)
    print("=" * 80)
    check_my_data(filtered_data)
    
    # test each metric
    for metric in metrics_list:
        print("")
        print("=" * 60)
        print(metric.upper() + " analysis")
        print("=" * 60)
        
        print("")
        print("summary stats:")
        stats = summary_stats(filtered_data, metric)
        print(stats)
        
        # check if enough data
        if stats['count'].min() < 2:
            print("")
            print("  warning: some methods have less than 2 observations")
            print("  skipping statistical tests")
            continue
        
        # kruskal wallis test
        h_stat, p_val = kruskal_test(filtered_data, metric)
        if h_stat is not None:
            print("")
            print("kruskal wallis test:")
            print("  h statistic: " + str(round(h_stat, 4)))
            print("  p value: " + str(round(p_val, 4)))
            print("  methods tested: " + str(len(filtered_data['method'].unique())))
            
            if p_val < 0.05:
                print("  result: significant difference found")
            else:
                print("  result: no significant difference")
            
            # post hoc tests if significant
            if p_val < 0.05 and len(filtered_data['method'].unique()) > 2:
                print("")
                print("pairwise comparisons:")
                pairwise_results = pairwise_tests(filtered_data, metric)
                if len(pairwise_results) > 0:
                    print(pairwise_results.to_string(index=False))
        else:
            print("")
            print("  cannot do statistical test")

print("")
print("=" * 80)
print("finished all tests")
print("=" * 80)


CRYO-SEM X3000
checking data:
total rows: 0
how many replicates: 0
how many methods: 0
counts per method:
Series([], dtype: int64)

ACCURACY analysis

summary stats:
Empty DataFrame
Columns: [count, mean, median, std, min, max]
Index: []

  cannot do statistical test

PRECISION analysis

summary stats:
Empty DataFrame
Columns: [count, mean, median, std, min, max]
Index: []

  cannot do statistical test

RECALL analysis

summary stats:
Empty DataFrame
Columns: [count, mean, median, std, min, max]
Index: []

  cannot do statistical test

SPECIFICITY analysis

summary stats:
Empty DataFrame
Columns: [count, mean, median, std, min, max]
Index: []

  cannot do statistical test

IOU_JACCARD analysis

summary stats:
Empty DataFrame
Columns: [count, mean, median, std, min, max]
Index: []

  cannot do statistical test

CRYO-SEM X10000
checking data:
total rows: 0
how many replicates: 0
how many methods: 0
counts per method:
Series([], dtype: int64)

ACCURACY analysis

summary stats:
Empty Data