In [2]:
# Import libraries
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from itertools import combinations
import numpy as np
from pathlib import Path
import re

import os
from pathlib import Path
# Locate the project root: the notebook runs from a sub-folder, so the
# project root is the parent of the current working directory.
notebook_dir = Path.cwd()
proj_root = notebook_dir.parent
# plain-string form of the root, used when joining data folder paths
org_dir = os.fspath(proj_root)

"""
This script loads all of my pore size analysis results from the data folders.
The data come from AFM, CRYO-SEM, STED, and CONFOCAL experiments.
Each technique has several concentrations (magnifications for CRYO-SEM),
and everything is combined into a single table.
"""

# Segmentation methods, bucketed into the three categories that the
# grouping helpers test membership against.
traditional_methods = [
    "FREEHAND",
    "OVAL",
]
semi_auto_methods = [
    "SEMI",
    "SAMJ",
    "ILASTIK",
    "60%",
]
full_auto_methods = [
    "PORED2",
    "UNET",
    "OTSU",
    "PHANSALKAR",
]

def clean_method_name(name):
    """
    Reduce a raw method label (typically taken from a filename) to a
    standard upper-case method name.

    Resolution order:
      1. anything that is not a string (including None) -> "UNKNOWN"
      2. a label containing both "60" and "%" -> "60%"
      3. a label starting with one of the known method names -> that name
      4. otherwise the first run of letters, digits or "%" in the label
      5. if no such run exists, the stripped label unchanged
    """
    if not isinstance(name, str):
        return "UNKNOWN"

    stripped = name.strip()
    token = stripped.upper()

    # the percentage method has no alphabetic prefix, so detect it by content
    if "%" in token and "60" in token:
        return "60%"

    # first known method that the label starts with (order matters)
    prefixes = (
        "FREEHAND", "OVAL", "SEMI", "SAMJ", "ILASTIK",
        "OTSU", "PHANSALKAR", "PORED2", "UNET", "PLAN",
    )
    hit = next((prefix for prefix in prefixes if token.startswith(prefix)), None)
    if hit is not None:
        return hit

    # fall back to the first alphanumeric/percent run, if there is one
    found = re.search(r"[A-Z0-9%]+", token)
    return found.group(0) if found else stripped

def get_method_group(method):
    """
    Map a method label to one of the three analysis categories.

    The label is first standardized with clean_method_name, then looked up
    in the module-level method lists. Returns "TRADITIONAL", "SEMI_AUTO",
    "FULL_AUTO", or "OTHER" when the method is in none of the lists.
    """
    label = clean_method_name(method)

    # checked in this order; the first list containing the label wins
    categories = (
        (traditional_methods, "TRADITIONAL"),
        (semi_auto_methods, "SEMI_AUTO"),
        (full_auto_methods, "FULL_AUTO"),
    )
    for members, group in categories:
        if label in members:
            return group
    return "OTHER"

# Root folders holding the experimental data, one per technique and
# concentration/magnification. Each relative path is joined onto org_dir.
data_roots = [
    os.path.join(org_dir, relative_root)
    for relative_root in (
        "AFM Accuracy INTERNAL/AFM 1% INTERNAL",
        "AFM Accuracy INTERNAL/AFM 1.5% INTERNAL",
        "AFM Accuracy INTERNAL/AFM 2% INTERNAL",
        "Confocal Accuracy INTERNAL/Confocal 0.375%",
        "Confocal Accuracy INTERNAL/Confocal 1%",
        "STED Accuracy INTERNAL/Internal 0.375%",
        "STED Accuracy INTERNAL/Internal 1%",
        "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X3000/CRYO-SEM X3000",
        "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X10000",
        "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X30000",
        "CRYO-SEM Accuracy INTERNAL/CRYO-SEM X60000",
    )
]

def get_technique(path_string):
    """
    Work out the imaging technique from keywords in a folder path.

    Matching is case-insensitive and checked in the order AFM, CONFOCAL,
    STED, CRYO-SEM; the first hit is returned. CRYO-SEM matches whenever
    both "CRYO" and "SEM" occur anywhere in the path. Returns "UNKNOWN"
    when nothing matches.
    """
    haystack = path_string.upper()

    # (label, keywords that must all be present), in priority order
    rules = (
        ("AFM", ("AFM",)),
        ("CONFOCAL", ("CONFOCAL",)),
        ("STED", ("STED",)),
        ("CRYO-SEM", ("CRYO", "SEM")),
    )
    for label, keywords in rules:
        if all(keyword in haystack for keyword in keywords):
            return label
    return "UNKNOWN"

def get_concentration(path_string, technique):
    """
    Pull the concentration (or magnification) label out of a path.

    AFM, STED and CONFOCAL paths are searched for a percentage label;
    CRYO-SEM paths are searched for a magnification label. Matching is a
    case-insensitive substring test and the first candidate found wins.
    Returns "unknown" when no candidate is present or the technique is
    not one of the four handled here.
    """
    haystack = path_string.upper()

    if technique in ("AFM", "STED", "CONFOCAL"):
        candidates = ("0.375%", "1.5%", "1%", "2%")
    elif technique == "CRYO-SEM":
        # longer labels first: "X3000" is a substring of "X30000"
        candidates = ("X60000", "X30000", "X10000", "X3000")
    else:
        return "unknown"

    return next((label for label in candidates if label in haystack), "unknown")

def get_replicate_number(path):
    """
    Find the replicate number encoded in a folder path.

    Replicates are marked with a number in square brackets, e.g. [1], [2].
    The first such marker in the path is used.

    Parameters:
        path: a string or path-like object; it is converted with str().

    Returns:
        The bracketed number as an int, or 1 when the path has no
        bracketed number (or the digits cannot be converted).
    """
    replicate_match = re.search(r"\[(\d+)\]", str(path))
    if replicate_match is None:
        return 1  # unmarked folders count as the first replicate

    try:
        return int(replicate_match.group(1))
    except ValueError:
        # only conversion errors are expected here; a bare except would
        # also hide KeyboardInterrupt and genuine programming errors
        return 1

def find_pore_size_folders(root_path):
    """
    Recursively collect every entry under root_path whose name is exactly
    PORE SIZE RESULTS (the folders holding the analysis outputs).

    Returns a list of paths in the order the recursive search yields them.
    """
    return list(root_path.rglob("PORE SIZE RESULTS"))

def load_all_pore_data():
    """
    Load every CSV found in the PORE SIZE RESULTS folders under data_roots
    and combine them into one DataFrame with metadata columns.

    For each root in the module-level data_roots list, the function finds
    the result folders, derives technique, concentration and replicate from
    the folder path, and reads each *.csv in the folder (sorted by path).
    Every loaded table gets the columns technique, concentration,
    replicate, file_source, method and method_group.

    Missing roots and unreadable CSV files are reported with print() and
    skipped, so one bad file does not abort the whole load.

    Returns:
        A DataFrame with the metadata columns first, followed by the
        columns read from the CSV files; an empty DataFrame if nothing
        was loaded.
    """
    all_data = []

    for root_str in data_roots:
        root_path = Path(root_str)

        if not root_path.exists():
            print(f"warning: path does not exist: {root_path}")
            continue

        pore_folders = find_pore_size_folders(root_path)
        print(f"found {len(pore_folders)} PORE SIZE RESULTS folders in {root_path}")

        for folder in pore_folders:
            # the metadata is encoded in the folder path, not in the files
            path_str = str(folder)
            technique = get_technique(path_str)
            concentration = get_concentration(path_str, technique)
            replicate = get_replicate_number(folder)

            csv_files = sorted(folder.glob("*.csv"))  # keep them in order
            print(f"  {technique} {concentration} replicate={replicate}: {len(csv_files)} csv files")

            for csv_path in csv_files:
                try:
                    data = pd.read_csv(csv_path)
                except Exception as error:
                    # deliberately broad: report any unreadable file and move on
                    print(f"error reading {csv_path}: {error}")
                    continue

                # the method is the first space-separated token of the file
                # name; it is the same for every row, so it is resolved once
                # per file instead of once per row
                method = clean_method_name(csv_path.stem.split(" ")[0])

                # `data` is a fresh frame owned by this loop, so the
                # metadata can be added in place without a copy
                data["technique"] = technique
                data["concentration"] = concentration
                data["replicate"] = replicate
                data["file_source"] = csv_path.name
                data["method"] = method
                data["method_group"] = get_method_group(method)

                all_data.append(data)

    if not all_data:
        print("warning: no csv files found")
        return pd.DataFrame()

    combined_data = pd.concat(all_data, ignore_index=True, sort=False)

    # put the metadata columns first, keeping the rest in their existing order
    metadata_cols = ["technique", "concentration", "replicate", "method_group", "method", "file_source"]
    other_cols = [col for col in combined_data.columns if col not in metadata_cols]
    return combined_data[metadata_cols + other_cols]

# Run the loader and show a quick overview of what came back.
print("loading all pore size data...")
results = load_all_pore_data()

print("")
print("combined pore size results:")
print(f"shape: {results.shape}")
print(results.head(12))

# sanity check: how many distinct source files back each
# technique/concentration/method combination
if len(results) > 0:
    print("")
    print("file counts by technique/concentration/method:")
    file_counts = (
        results.groupby(["technique", "concentration", "method"])["file_source"]
        .nunique()
        .reset_index(name="file_count")
    )
    print(file_counts.head(25))

# Write the combined table to one CSV file. The first data root is the
# preferred location; the project root is the fallback when it is missing.
try:
    output_dir = Path(data_roots[0])
    if not output_dir.exists():
        output_dir = Path(org_dir)
    output_file = output_dir / "pore_size_results_all.csv"
    results.to_csv(output_file, index=False, encoding="utf-8")
    print("")
    print(f"saved to: {output_file}")
except Exception as error:
    print(f"error saving file: {error}")

loading all pore size data...
found 8 PORE SIZE RESULTS folders in c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\AFM Accuracy INTERNAL\AFM 1% INTERNAL
  AFM 1% replicate=1: 11 csv files
  AFM 1% replicate=1: 0 csv files
  AFM 1% replicate=1: 0 csv files
  AFM 1% replicate=1: 0 csv files
  AFM 1% replicate=2: 11 csv files
  AFM 1% replicate=3: 11 csv files
  AFM 1% replicate=4: 11 csv files
  AFM 1% replicate=5: 11 csv files
found 1 PORE SIZE RESULTS folders in c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\AFM Accuracy INTERNAL\AFM 1.5% INTERNAL
  AFM 1.5% replicate=1: 11 csv files
found 3 PORE SIZE RESULTS folders in c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\AFM Accuracy INTERNAL\AFM 2% INTERNAL
  AFM 2% replicate=1: 11 csv files
  AFM 2% replicate=2: 10 csv files
  AFM 2% replicate=3: 11 csv files
found 1 PORE SIZE RESULTS folders in c:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\Confocal Accura

In [2]:
# Work on an independent (deep) copy of the table loaded in the previous
# cell, so nothing done from here on modifies `results`.
combined_pore_size_df = results.copy(deep=True)

def assign_family(method_name):
    """
    Classify a method name by how much human input the method needs.

    The name is converted to an upper-case string and looked up in the
    module-level method lists. Returns "Traditional", "Semi-automated" or
    "Fully automated", or "Unknown" when the name is in none of the lists.
    """
    key = str(method_name).upper()

    # checked in this order; the first list containing the name wins
    families = (
        (traditional_methods, "Traditional"),
        (semi_auto_methods, "Semi-automated"),
        (full_auto_methods, "Fully automated"),
    )
    return next((family for members, family in families if key in members), "Unknown")

# Tag every row with its method family so families can be compared later.
combined_pore_size_df['family'] = combined_pore_size_df['method'].map(assign_family)

# preview the first rows
combined_pore_size_df.head()

Unnamed: 0,technique,concentration,replicate,method_group,method,file_source,Method,AECD_um,SourceFile,Subfolder,family
0,AFM,1%,1,SEMI_AUTO,60%,60% AECD Results.csv,60%,11.269509,60% Results,.,Semi-automated
1,AFM,1%,1,TRADITIONAL,FREEHAND,FREEHAND AECD Results.csv,FREEHAND,1.134007,FREEHAND Results,.,Traditional
2,AFM,1%,1,TRADITIONAL,FREEHAND,FREEHAND AECD Results.csv,FREEHAND,1.43353,FREEHAND Results,.,Traditional
3,AFM,1%,1,TRADITIONAL,FREEHAND,FREEHAND AECD Results.csv,FREEHAND,0.816809,FREEHAND Results,.,Traditional
4,AFM,1%,1,TRADITIONAL,FREEHAND,FREEHAND AECD Results.csv,FREEHAND,0.46933,FREEHAND Results,.,Traditional


In [3]:
"""
This cell runs the statistical analysis on my pore size data.
The data are prepared first, then tests are run to compare methods.
The main goal is to see whether different segmentation methods give significantly different results.
"""

# start with the combined dataframe from before
metric_df = combined_pore_size_df.copy()

# add a nanometer column: every AECD_um value (micrometers, going by the
# column name) is multiplied by 1000
metric_df['AECD_nm'] = metric_df['AECD_um'] * 1000

# clean up method names: upper-case and strip whitespace FIRST, so that a
# typo with surrounding spaces is still caught by the typo table below
metric_df['method'] = metric_df['method'].str.upper().str.strip()
# fix the common typos i found
metric_df['method'] = metric_df['method'].replace({'OAVL': 'OVAL', 'OVL': 'OVAL'})

# the family column was assigned from the uncorrected names, so refresh it;
# otherwise a corrected row would keep the family of its misspelled name
metric_df['family'] = metric_df['method'].apply(assign_family)

# calculate median values for each replicate
# each replicate has multiple measurements, and the comparison is between
# replicates, not between individual measurements
replicate_medians = metric_df.groupby(['technique', 'concentration', 'method', 'replicate']).agg({
    'AECD_nm': 'median',
    'family': 'first'
}).reset_index()

# the metric i want to analyze
metrics = ['AECD_nm']

def print_data_info(df_subset):
    """
    Print a short overview of one condition's data: the row count, the
    number of distinct replicates and methods, and the rows per method.

    df_subset must have 'replicate' and 'method' columns. Nothing is
    returned; everything goes to standard output.
    """
    print("")
    print("data information:")
    print(f"total replicate medians: {len(df_subset)}")
    print(f"number of replicates: {df_subset['replicate'].nunique()}")
    print(f"number of methods: {df_subset['method'].nunique()}")
    print("")
    print("observations per method:")
    print(df_subset.groupby('method').size())

def perform_kruskal_wallis(df, metric, group_by='method'):
    """
    Run a Kruskal-Wallis test on `metric` across the groups of `group_by`.

    Missing values are dropped and groups left with no values are ignored.

    Returns a tuple (h_statistic, p_value, n_groups):
      - (None, None, None) when fewer than two groups have data
      - (None, None, n_groups) when every group has zero spread
        (a warning is printed)
      - otherwise the scipy kruskal result and the number of groups
    """
    # one array of non-missing values per group, skipping empty groups
    samples = [
        values
        for values in (part[metric].dropna().values for _, part in df.groupby(group_by))
        if len(values) > 0
    ]

    if len(samples) < 2:
        return None, None, None

    n_groups = len(samples)

    # bail out if no group has any spread
    spreads = [sample.std() for sample in samples]
    if not any(spread != 0 for spread in spreads):
        print("  warning: all groups have zero variance")
        return None, None, n_groups

    h_stat, p_value = kruskal(*samples)
    return h_stat, p_value, n_groups

def calculate_summary_stats(df, metric, group_by='method'):
    """
    Summarize `metric` for each group of `group_by`.

    Returns a DataFrame indexed by group with the columns n, mean, median,
    std, min and max, rounded to 4 decimal places.
    """
    # (output column name, pandas aggregation) pairs, in output order
    aggregations = [
        ('n', 'count'),
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std'),
        ('min', 'min'),
        ('max', 'max'),
    ]
    per_group = df.groupby(group_by)[metric]
    return per_group.agg(aggregations).round(4)

def holm_correction(p_values):
    """
    Adjust p values for multiple testing with Holm's step-down method.

    Holm controls the family-wise error rate and is never more
    conservative than a plain Bonferroni correction.

    Non-finite entries (NaN, inf) are left out of the correction: they do
    not count towards the number of tests and come back as NaN.

    Returns a float array of adjusted p values, aligned with the input.
    """
    raw = np.asarray(p_values, dtype=float)
    adjusted = np.full(len(raw), np.nan, dtype=float)

    usable = np.isfinite(raw)
    finite = raw[usable]
    count = len(finite)
    if count == 0:
        return adjusted

    # smallest p value gets the largest multiplier (count), the next
    # count - 1, and so on; each product is capped at 1
    order = np.argsort(finite)
    multipliers = count - np.arange(count)
    stepped = np.minimum(multipliers * finite[order], 1.0)

    # enforce monotonicity with a running maximum that starts at 0
    stepped = np.maximum.accumulate(np.maximum(stepped, 0.0))

    # undo the sort so results line up with the finite inputs
    ranked = np.empty_like(finite)
    ranked[order] = stepped

    adjusted[usable] = ranked
    return adjusted

def perform_pairwise_mannwhitney(df, metric, group_by='method'):
    """
    Compare every pair of groups with a two-sided Mann-Whitney U test.

    Intended as the post hoc step after a significant Kruskal-Wallis test.
    Pairs where either group has no non-missing values, or where the test
    raises ValueError, are skipped.

    Returns a DataFrame with one row per tested pair: group names, sample
    sizes, medians, U statistic, raw p value, Holm-adjusted p value, and
    "Yes"/"No" significance flags at the 0.05 level for both p values.
    The DataFrame is empty when no pair could be tested.
    """
    labels = df[group_by].dropna().unique()
    rows = []

    for first, second in combinations(labels, 2):
        sample_a = df.loc[df[group_by] == first, metric].dropna().values
        sample_b = df.loc[df[group_by] == second, metric].dropna().values

        if len(sample_a) == 0 or len(sample_b) == 0:
            continue

        try:
            u_stat, raw_p = mannwhitneyu(sample_a, sample_b, alternative='two-sided')
            rows.append({
                'group1': first,
                'group2': second,
                'n1': len(sample_a),
                'n2': len(sample_b),
                'median1': np.median(sample_a),
                'median2': np.median(sample_b),
                'U_statistic': u_stat,
                'p_value': raw_p
            })
        except ValueError:
            # the test could not be computed for this pair; move on
            continue

    table = pd.DataFrame(rows)
    if len(table) == 0:
        return table

    # adjust for the number of pairs, then flag both raw and adjusted values
    table['p_value_holm'] = holm_correction(table['p_value'])
    raw_flags = ["Yes" if p < 0.05 else "No" for p in table['p_value']]
    holm_flags = ["Yes" if p < 0.05 else "No" for p in table['p_value_holm']]
    table['significant_raw'] = raw_flags
    table['significant_holm'] = holm_flags

    return table

# (technique, concentration-or-magnification) pairs to analyze, expanded
# from one entry per technique; the order here is the order of analysis
conditions = [
    (technique_name, level)
    for technique_name, levels in (
        ('CRYO-SEM', ('X3000', 'X10000', 'X30000', 'X60000')),
        ('AFM', ('1%', '2%', '1.5%')),
        ('STED', ('0.375%', '1%')),
        ('CONFOCAL', ('0.375%', '1%')),
    )
    for level in levels
]

# accumulators filled by the analysis loop
all_kw_results, all_pairwise_results = [], []

# Analyze each technique/concentration pair in turn: print an overview,
# run Kruskal-Wallis across methods, and follow up with pairwise tests
# when the overall test is significant.
for technique, concentration in conditions:
    # replicate medians belonging to this condition
    df_subset = replicate_medians[
        (replicate_medians['technique'] == technique)
        & (replicate_medians['concentration'] == concentration)
    ]

    print("")
    print("")
    print(f"analyzing: {technique} {concentration}")
    print_data_info(df_subset)

    for metric in metrics:
        print("")
        print(f"metric: {metric.upper().replace('_', ' ')} - comparing methods across replicate medians")

        print("")
        print("summary statistics by method:")
        summary = calculate_summary_stats(df_subset, metric)
        print(summary)

        # skip the metric when a method has no usable observations
        if summary['n'].min() < 1:
            print("  warning: some methods have fewer than 1 observation")
            continue

        h_stat, p_value, n_groups = perform_kruskal_wallis(df_subset, metric)
        if h_stat is None:
            print("  cannot perform statistical test (insufficient variation or data)")
            continue

        print("")
        print("kruskal wallis test:")
        print(f"  h statistic: {round(h_stat, 3)!s}")
        print(f"  p value: {p_value!s}")
        print(f"  number of methods compared: {n_groups!s}")
        print(
            "  result: significant difference found"
            if p_value < 0.05
            else "  result: no significant difference"
        )

        # keep the overall test result for later
        all_kw_results.append(dict(
            technique=technique,
            concentration=concentration,
            metric=metric,
            H_statistic=h_stat,
            p_value=p_value,
            n_methods=n_groups,
        ))

        # post hoc tests only make sense after a significant result
        # with more than two groups
        if not (p_value < 0.05 and n_groups > 2):
            continue

        print("")
        print("post hoc pairwise comparisons (mann whitney u with holm correction):")
        pairwise_results = perform_pairwise_mannwhitney(df_subset, metric)
        if len(pairwise_results) == 0:
            continue

        # label both sides of each comparison with the method's family
        family_map = dict(zip(df_subset['method'], df_subset['family']))
        pairwise_results['family1'] = pairwise_results['group1'].map(family_map)
        pairwise_results['family2'] = pairwise_results['group2'].map(family_map)

        # identify the condition in the leading columns
        pairwise_results.insert(0, 'technique', technique)
        pairwise_results.insert(1, 'concentration', concentration)
        pairwise_results.insert(2, 'metric', metric)

        all_pairwise_results.append(pairwise_results)
        print(pairwise_results.to_string(index=False))

print("")
print("analysis complete")



analyzing: CRYO-SEM X3000

data information:
total replicate medians: 36
number of replicates: 4
number of methods: 9

observations per method:
method
60%         4
FREEHAND    4
GOLD        4
ILASTIK     4
OTSU        4
OVAL        4
PLAN        4
PORED2      4
UNET        4
dtype: int64

metric: AECD NM - comparing methods across replicate medians

summary statistics by method:
          n        mean      median        std       min         max
method                                                              
60%       4    216.0327    218.5048    31.7923  174.8077    252.3133
FREEHAND  4    641.4664    613.7326   240.9516  387.6110    950.7892
GOLD      4    469.1645    496.5085   199.1793  217.0481    666.5928
ILASTIK   4    200.6466    198.8224    52.6139  142.7299    262.2116
OTSU      4     91.8066     94.4070    14.8736   71.3650    107.0474
OVAL      4    801.5820    882.7241   325.3470  363.8913   1076.9884
PLAN      4     73.1864     75.5767     8.5645   61.8039     79