## **SPLIT CELLS SINGLE GOLD STANDARD**

In [None]:
# STED analysis script
# This script reads pore masks and measures how good each method is

import math
import os
import csv
import numpy as np
import cv2
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from scipy.stats import kruskal, wilcoxon

"""
Set up file paths and basic settings
We need to tell the script where to find the image files
"""
import os
from pathlib import Path
# The parent of the current working directory is treated as the project root
notebook_dir = Path.cwd()
proj_root = notebook_dir.parent
org_dir = str(proj_root)

# Folder that holds the original TIFF images
#base_dir = r"C:\Users\walsh\Downloads\STED Accuracy INTERNAL\Internal 0.375%"
base_dir = os.path.join(org_dir, "[STED] Internal 0.375%")

# Folder that receives every result file
results_folder = os.path.join(org_dir, "SINGLE GS METHOD")

# Gold standard image: the reference every method is compared against
gold_image_path = os.path.join(base_dir, "GOLD STANDARD.tif")

# One mask image per segmentation method under test
method_image_paths = [
    os.path.join(base_dir, file_name)
    for file_name in (
        "60%.tif",
        "FREEHAND.tif",
        "ILASTIK.tif",
        "OVAL.tif",
        "OTSU.tif",
        "PLANKSTER.tif",
        "PORED2.tif",
        "SAMJ.tif",
        "SEMI.tif",
        "UNET.tif",
    )
]

# ---- Basic settings for the analysis ----

# Label used for this experiment in titles and file names
experiment_name = "[STED 0.375%]"

# Side length (in pixels) of the square patches that are analyzed
patch_size = 64

# Output folders (figures are written straight into the results folder)
figures_folder = results_folder
accuracy_folder = os.path.join(results_folder, "Accuracy")
stats_folder = os.path.join(results_folder, "Stats")

# Create the output folders when they are missing
os.makedirs(results_folder, exist_ok=True)
os.makedirs(accuracy_folder, exist_ok=True)
os.makedirs(stats_folder, exist_ok=True)

In [4]:
# Read images and convert them to binary masks.
# Pores are the black pixels; in the binary mask they are stored as 1 and everything else as 0.

"""
Try to import image reading libraries
We use different libraries as backup in case one doesn't work
"""
# Optional backup readers. Each flag records whether the library could be
# imported; read_image_file checks the flag before using the library.
# Only ImportError is caught so that Ctrl-C and genuine errors still surface.
try:
    import tifffile as tiff
    has_tifffile = True
except ImportError:
    has_tifffile = False

try:
    from PIL import Image
    has_pil = True
except ImportError:
    has_pil = False

def read_image_file(file_path):
    """
    Read an image file, trying several libraries in turn

    Order: OpenCV (as grayscale), then tifffile, then PIL (converted to "L").
    The tifffile and PIL readers are only tried when their import succeeded.

    Returns the image as an array, or None when no reader could load the file.
    """
    # Try OpenCV first (imread returns None when it cannot decode the file)
    img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if img is not None:
        return img

    # Try tifffile library if available
    if has_tifffile:
        try:
            return tiff.imread(file_path)
        except Exception:
            # Best effort: fall through to the next reader
            pass

    # Try PIL as last option
    if has_pil:
        try:
            with Image.open(file_path) as pil_img:
                return np.array(pil_img.convert("L"))
        except Exception:
            # Best effort: report failure through the None return below
            pass

    return None

def make_grayscale(image_array):
    """
    Convert image to grayscale if it has multiple channels

    - None is passed through as None
    - 2-D arrays are returned unchanged
    - (H, W, 1) arrays are returned as their single channel
    - (H, W, 4) arrays are converted with COLOR_BGRA2GRAY
    - other 3-D arrays are converted with COLOR_BGR2GRAY
    - anything else is returned unchanged
    """
    if image_array is None:
        return None

    if image_array.ndim == 3:
        channel_count = image_array.shape[2]

        # A trailing axis of length 1 is already grayscale; cvtColor would
        # reject it, so just drop the axis
        if channel_count == 1:
            return image_array[:, :, 0]

        # Images with an alpha channel need the 4-channel conversion code
        if channel_count == 4:
            return cv2.cvtColor(image_array, cv2.COLOR_BGRA2GRAY)

        # Color image - convert to grayscale
        return cv2.cvtColor(image_array, cv2.COLOR_BGR2GRAY)

    # Already grayscale (2-D) or an unexpected rank: leave as is
    return image_array

def convert_to_binary(gray_image):
    """
    Turn a grayscale image into a 0/1 mask (uint8)

    Images with exactly two gray levels: the lower level becomes 1.
    Any other image: Otsu thresholding is applied and the pixels that end
    up at 0 (the dark side) become 1.
    """
    levels = np.unique(gray_image)

    if len(levels) != 2:
        # Not a two-level image: let Otsu pick the threshold automatically
        _, otsu_image = cv2.threshold(
            gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return (otsu_image == 0).astype(np.uint8)

    # Two-level image: np.unique is sorted, so the first level is the darker one
    darkest_level = levels[0]
    return (gray_image == darkest_level).astype(np.uint8)

def load_mask_image(image_path):
    """
    Load an image from disk and return it as a binary mask

    Prints an error and returns None when the file does not exist or
    cannot be read.
    """
    if os.path.exists(image_path):
        pixels = read_image_file(image_path)
        if pixels is not None:
            # Grayscale first, then binarize
            return convert_to_binary(make_grayscale(pixels))
        print("Error: could not read image -", image_path)
    else:
        print("Error: file not found -", image_path)

    return None

"""
Load the gold standard image (the correct answer we compare to)
"""
# NOTE(review): load_mask_image prints an error and returns None when the file
# is missing or unreadable, so gold_standard_mask can be None after this line.
gold_standard_mask = load_mask_image(gold_image_path)

In [5]:
# Calculate how good each method is at finding pores
# We skip making overlay images to save time

import math

"""
Helper functions to calculate basic metrics
"""

def count_pixels(true_mask, predicted_mask):
    """
    Count the confusion-matrix cells for two masks

    Any value above 0 counts as a pore pixel.
    Returns (true_positive, false_positive, true_negative, false_negative)
    as regular Python ints.
    """
    # Boolean views: True where the mask marks a pore
    is_pore_true = true_mask > 0
    is_pore_pred = predicted_mask > 0

    hits = int(np.count_nonzero(is_pore_true & is_pore_pred))
    false_alarms = int(np.count_nonzero(~is_pore_true & is_pore_pred))
    correct_rejections = int(np.count_nonzero(~is_pore_true & ~is_pore_pred))
    misses = int(np.count_nonzero(is_pore_true & ~is_pore_pred))

    return hits, false_alarms, correct_rejections, misses

def safe_divide(top_number, bottom_number):
    """
    Float division that returns 0.0 instead of failing on a zero divisor
    """
    return float(top_number) / float(bottom_number) if bottom_number != 0 else 0.0

def calculate_metrics(true_mask, predicted_mask):
    """
    Calculate all the important metrics for comparing masks

    Returns a dict with accuracy, precision, recall, specificity,
    balanced_accuracy, f1_dice, iou_jaccard, mcc and the raw counts
    TP / FP / TN / FN.

    Every ratio goes through safe_divide, so a metric whose denominator is
    zero is reported as 0.0. In particular Dice and IoU are 0.0 when neither
    mask contains a pore pixel. MCC is 0.0 when any marginal sum is zero.
    """
    tp, fp, tn, fn = count_pixels(true_mask, predicted_mask)

    # Basic metrics
    accuracy = safe_divide(tp + tn, tp + tn + fp + fn)
    precision = safe_divide(tp, tp + fp)
    recall = safe_divide(tp, tp + fn)
    specificity = safe_divide(tn, tn + fp)

    # Combined metrics
    balanced_accuracy = 0.5 * (recall + specificity)
    dice_score = safe_divide(2 * tp, 2 * tp + fp + fn)
    iou_score = safe_divide(tp, tp + fp + fn)

    # MCC: work in floats so the product of the marginal sums stays a
    # plain float computation
    tp_f = float(tp)
    tn_f = float(tn)
    fp_f = float(fp)
    fn_f = float(fn)

    denom1 = (tp_f + fp_f) * (tp_f + fn_f)
    denom2 = (tn_f + fp_f) * (tn_f + fn_f)

    mcc = 0.0
    if denom1 > 0 and denom2 > 0:
        numerator = (tp_f * tn_f) - (fp_f * fn_f)
        mcc = numerator / math.sqrt(denom1 * denom2)
        # Keep MCC between -1 and 1 (guards against rounding)
        mcc = max(-1.0, min(1.0, mcc))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "balanced_accuracy": balanced_accuracy,
        "f1_dice": dice_score,
        "iou_jaccard": iou_score,
        "mcc": mcc,
        "TP": tp, "FP": fp, "TN": tn, "FN": fn
    }

def analyze_image_patches(true_mask, predicted_mask, patch_size=64):
    """
    Tile the image into patch_size x patch_size squares and score each one

    Tiles on the right and bottom edge may be smaller. Tiles with fewer
    than 10 pixels are skipped. Returns a list with one metrics dict per
    tile, in row-major order.
    """
    rows, cols = true_mask.shape
    collected = []

    for top in range(0, rows, patch_size):
        bottom = min(top + patch_size, rows)

        for left in range(0, cols, patch_size):
            right = min(left + patch_size, cols)

            gold_tile = true_mask[top:bottom, left:right]

            # Too small to give a meaningful score
            if gold_tile.size < 10:
                continue

            pred_tile = predicted_mask[top:bottom, left:right]
            collected.append(calculate_metrics(gold_tile, pred_tile))

    return collected

"""
Test each segmentation method
"""
# method_names: every method we attempted, in order
# method_results: method name -> list of per-patch metric dicts (only for
# methods that were analyzed successfully)
method_names = []
method_results = {}

print("Starting analysis of all methods...")

for method_path in method_image_paths:
    # Get the method name from the file path
    method_name = os.path.basename(method_path).replace('.tif', '')
    method_names.append(method_name)

    print("Testing method:", method_name)

    # load_mask_image returns None on failure; without the gold standard
    # nothing can be compared
    if gold_standard_mask is None:
        print("  Error: gold standard mask is not loaded, skipping", method_name)
        continue

    try:
        # Load the predicted mask
        predicted_mask = load_mask_image(method_path)
        if predicted_mask is None:
            print("  Error: could not load mask for", method_name)
            continue

        # Check if images are the same size
        if gold_standard_mask.shape != predicted_mask.shape:
            print("  Error: size mismatch for", method_name)
            continue

        # Analyze this method using patches
        patch_results = analyze_image_patches(gold_standard_mask, predicted_mask, patch_size)

        # If no patches, analyze the whole image
        if len(patch_results) == 0:
            whole_image_result = calculate_metrics(gold_standard_mask, predicted_mask)
            patch_results = [whole_image_result]

        # Store results
        method_results[method_name] = patch_results

        # Print quick summary
        dice_scores = [result["f1_dice"] for result in patch_results]
        mean_dice = np.mean(dice_scores)

        print("  Patches:", len(patch_results), "Mean Dice:", round(mean_dice, 3))

    except Exception as e:
        # Keep going with the remaining methods if one of them fails
        print("  Error processing", method_name, ":", str(e))
        continue

print("Methods tested:", len(method_results))

Starting analysis of all methods...
Testing method: 60%
  Patches: 81 Mean Dice: 0.65
Testing method: FREEHAND
  Patches: 81 Mean Dice: 0.167
Testing method: OVAL
  Patches: 81 Mean Dice: 0.138
Testing method: ILASTIK
  Patches: 81 Mean Dice: 0.541
Testing method: OTSU
  Patches: 81 Mean Dice: 0.031
Testing method: PLANKSTER
  Patches: 81 Mean Dice: 0.635
Testing method: PORED2
  Patches: 81 Mean Dice: 0.511
Testing method: SAMJ
  Patches: 81 Mean Dice: 0.126
Testing method: SEMI
  Patches: 81 Mean Dice: 0.663
Testing method: UNET
  Patches: 81 Mean Dice: 0.628
Methods tested: 10


In [6]:
# Compare methods to find the best one and see how they differ
# We use statistical tests to see which differences are real

"""
Find the best method (highest average Dice score)
"""
from scipy.stats import ttest_rel

# Average Dice score per method
method_dice_averages = {}
for method_name, results_list in method_results.items():
    dice_scores = [result["f1_dice"] for result in results_list]
    average_dice = np.mean(dice_scores)
    method_dice_averages[method_name] = average_dice

# Find which method has the highest average
best_method = max(method_dice_averages, key=method_dice_averages.get)
print("Best method (highest Dice score):", best_method)

# Compare all other methods to the best one
# Simple paired comparison without bootstrap: patch i of one method is paired
# with patch i of the other, which works because every method is tiled on the
# same grid
best_method_scores = [result["f1_dice"] for result in method_results[best_method]]
best_avg = np.mean(best_method_scores)

comparison_results = []
for method_name, results_list in method_results.items():
    if method_name == best_method:
        continue  # Skip comparing best method to itself

    method_scores = [result["f1_dice"] for result in results_list]

    # Calculate basic difference
    method_avg = np.mean(method_scores)
    difference = best_avg - method_avg

    # Paired t-test on the patches both methods have in common.
    # Both lists are cut to the same length so the test never fails just
    # because one method has more patches than the other.
    pair_count = min(len(best_method_scores), len(method_scores))
    p_value = 1.0  # Default: assume no difference
    if pair_count > 1:
        try:
            stat, p_value = ttest_rel(best_method_scores[:pair_count],
                                      method_scores[:pair_count])
            p_value = float(p_value)
        except Exception:
            p_value = 1.0  # If test fails, assume no difference

        # Identical score lists give NaN; treat that as "no difference" so
        # the sort below stays well defined
        if math.isnan(p_value):
            p_value = 1.0

    comparison_results.append({
        'method': method_name,
        'difference': difference,
        'p_value': p_value
    })

# Sort by p-value (most significant first)
comparison_results.sort(key=lambda x: x['p_value'])

"""
Calculate pore fraction bias (does method find too many or too few pores?)
"""
def calculate_pore_fraction(mask):
    """Return the mean of the mask, i.e. the pore fraction for a 0/1 mask"""
    pore_fraction = np.mean(mask)
    return pore_fraction

def get_patch_pore_fractions(true_mask, pred_mask, patch_size=64):
    """
    Pore fraction of every patch, for the true and the predicted mask

    Uses the same tiling as the patch analysis: patch_size squares,
    smaller tiles at the edges, tiles under 10 pixels skipped.
    Returns two arrays (true fractions, predicted fractions).
    """
    rows, cols = true_mask.shape
    gold_values = []
    guess_values = []

    for top in range(0, rows, patch_size):
        bottom = min(top + patch_size, rows)

        for left in range(0, cols, patch_size):
            right = min(left + patch_size, cols)

            gold_tile = true_mask[top:bottom, left:right]
            if gold_tile.size < 10:  # Skip tiny patches
                continue

            guess_tile = pred_mask[top:bottom, left:right]
            gold_values.append(calculate_pore_fraction(gold_tile))
            guess_values.append(calculate_pore_fraction(guess_tile))

    return np.array(gold_values), np.array(guess_values)

# method name -> mean/std of (predicted - true) pore fraction over all patches
bias_results = {}
for method_path in method_image_paths:
    method_name = os.path.basename(method_path).replace('.tif', '')

    # Load the predicted mask
    predicted_mask = load_mask_image(method_path)

    # Same safety checks as the accuracy analysis: a missing mask or a size
    # mismatch would otherwise give an error or compare misaligned patches
    if gold_standard_mask is None or predicted_mask is None:
        print("  Error: could not load masks for", method_name)
        continue
    if gold_standard_mask.shape != predicted_mask.shape:
        print("  Error: size mismatch for", method_name)
        continue

    # Get pore fractions for each patch (same patch size as the accuracy analysis)
    true_fractions, pred_fractions = get_patch_pore_fractions(
        gold_standard_mask, predicted_mask, patch_size)

    # Calculate bias (positive = over-estimates pores, negative = under-estimates)
    bias_values = pred_fractions - true_fractions
    mean_bias = np.mean(bias_values)

    bias_results[method_name] = {
        'mean_bias': mean_bias,
        'std_bias': np.std(bias_values) if len(bias_values) > 1 else 0.0,
        'num_patches': len(bias_values)
    }

"""
Set up colors and styles for plotting
"""
# Define method categories
# Keywords for each category. get_method_color looks for them as substrings
# of the upper-cased method name, so they must be written in upper case.
traditional_methods = ["FREEHAND", "OVAL"]
semi_auto_methods = ["SEMI", "SAMJ", "ILASTIK", "60%"]
fully_auto_methods = ["PORED2", "UNET", "OTSU", "PLANKSTER"]

# Colors for each category
traditional_color = "#9ecae1"  # Light blue
semi_auto_color = "#d0b7ff"   # Light purple
fully_auto_color = "#f7b6b6"  # Light red
other_color = "#dddddd"       # Gray (used when no category keyword matches)

def get_method_color(method_name):
    """
    Pick the plot color for a method from its category

    The upper-cased name is searched for each category's keywords, in the
    order traditional, semi-automated, fully automated. The first category
    with a matching keyword wins; names with no match get the gray
    fallback color.
    """
    upper_name = method_name.upper()

    category_table = (
        (traditional_methods, traditional_color),
        (semi_auto_methods, semi_auto_color),
        (fully_auto_methods, fully_auto_color),
    )

    for keywords, category_color in category_table:
        for keyword in keywords:
            if keyword in upper_name:
                return category_color

    return other_color

# Create legend
# Legend entries that use the same face colors as get_method_color
# NOTE: the plotting cell further down rebinds legend_patches and
# metrics_to_plot with its own values.
legend_patches = [
    Patch(facecolor=traditional_color, edgecolor='black', label='Traditional'),
    Patch(facecolor=semi_auto_color, edgecolor='black', label='Semi-automated'),
    Patch(facecolor=fully_auto_color, edgecolor='black', label='Fully automated')
]

"""
Prepare data for making plots
"""
# Metric names for display (keys are the metric keys returned by calculate_metrics)
metric_display_names = {
    "f1_dice": "Dice (F1)",
    "iou_jaccard": "IoU (Jaccard)",
    "mcc": "Matthews CC",
    "precision": "Precision",
    "recall": "Recall",
    "specificity": "Specificity",
    "balanced_accuracy": "Balanced Accuracy",
    "accuracy": "Accuracy"
}

# List of metrics to analyze
metrics_to_plot = ["f1_dice", "iou_jaccard", "mcc", "precision", "recall", "specificity", "balanced_accuracy", "accuracy"]

def save_figure(fig, filename):
    """
    Write the figure to the figures folder as .tif (300 dpi) and .pdf

    The characters [ ] : are removed from the file name. The figure is
    closed afterwards and the cleaned name is printed.
    """
    # Drop the characters we do not want in file names
    clean_name = filename.translate(str.maketrans('', '', '[]:'))

    fig.tight_layout()

    tif_target = os.path.join(figures_folder, clean_name + '.tif')
    fig.savefig(tif_target, dpi=300, bbox_inches='tight')

    pdf_target = os.path.join(figures_folder, clean_name + '.pdf')
    fig.savefig(pdf_target, bbox_inches='tight')

    plt.close(fig)

    print("Saved:", clean_name)

Best method (highest Dice score): SEMI


In [16]:
# Make charts and plots to show the results
# Create bar charts, box plots, heatmap, and Bland-Altman plots

import warnings
# NOTE(review): this hides every warning raised from here on, not only the
# ones produced by plotting.
warnings.filterwarnings("ignore")

"""
Set up what we want to plot
"""
# All the metrics we calculated
# (rebinds metrics_to_plot from the earlier cell, in a different order)
metrics_to_plot = ["accuracy", "precision", "recall", "specificity", 
                  "balanced_accuracy", "f1_dice", "iou_jaccard", "mcc"]

# Key metrics for box plots
key_metrics = ["f1_dice", "iou_jaccard", "mcc"]

# Nice names for the plots (axis labels, titles and saved file names)
metric_names = {
    "accuracy": "Accuracy",
    "precision": "Precision", 
    "recall": "Recall",
    "specificity": "Specificity",
    "balanced_accuracy": "Balanced Accuracy",
    "f1_dice": "Dice Score",
    "iou_jaccard": "IoU Score",
    "mcc": "MCC"
}

"""
Create legend for method groups
"""
def create_method_legend():
    """
    Make legend patches for different method types

    The patches use the same category colors that get_method_color gives to
    the bars and boxes (with a black edge), so the legend matches the plot.
    """
    from matplotlib.patches import Patch

    legend_patches = [
        Patch(facecolor=traditional_color, edgecolor='black', label='Traditional'),
        Patch(facecolor=semi_auto_color, edgecolor='black', label='Semi-Automated'),
        Patch(facecolor=fully_auto_color, edgecolor='black', label='Fully-Automated')
    ]
    return legend_patches

"""
Simple statistical tests
"""
def simple_t_test(values1, values2):
    """
    Compare two groups of values with an independent two-sample t-test

    Returns the p-value. Returns 1.0 ("no difference") when either group has
    fewer than 2 values, when the test cannot be run, or when the test
    result is NaN (for example when both groups are constant).
    """
    try:
        from scipy.stats import ttest_ind
        if len(values1) < 2 or len(values2) < 2:
            return 1.0
        stat, p_value = ttest_ind(values1, values2)
    except Exception:
        # Best effort: any failure counts as "no difference"
        return 1.0

    # A NaN p-value cannot be compared or sorted; report it as no difference
    if np.isscalar(p_value) and np.isnan(p_value):
        return 1.0
    return p_value

"""
Prepare the data for plotting
"""
print("Preparing data for plots...")

# summary_data[method][metric] -> mean, std, raw per-patch values and count
summary_data = {}
for method_name, results_list in method_results.items():
    per_metric = {}

    for metric in metrics_to_plot:
        # Collect this metric from every patch that reports it
        values = [result[metric] for result in results_list if metric in result]
        if not values:
            continue

        mean_val = np.mean(values)
        std_val = 0 if len(values) <= 1 else np.std(values)

        per_metric[metric] = {
            'mean': mean_val,
            'std': std_val,
            'values': values,
            'count': len(values)
        }

    summary_data[method_name] = per_metric

# Legend shared by the bar charts and box plots
legend_patches = create_method_legend()

"""
Create bar charts for each metric
"""
print("Creating bar charts...")

# One bar chart per metric: mean per method with the std as error bar
for metric in metrics_to_plot:
    # Get data for this metric
    methods_for_plot = []
    means_for_plot = []
    stds_for_plot = []
    colors_for_plot = []

    for method_name, method_summary in summary_data.items():
        if metric in method_summary:
            methods_for_plot.append(method_name)
            means_for_plot.append(method_summary[metric]['mean'])
            stds_for_plot.append(method_summary[metric]['std'])
            colors_for_plot.append(get_method_color(method_name))

    if not methods_for_plot:
        continue

    # Sort by performance (best first)
    combined = list(zip(methods_for_plot, means_for_plot, stds_for_plot, colors_for_plot))
    combined.sort(key=lambda x: x[1], reverse=True)

    methods_sorted = [x[0] for x in combined]
    means_sorted = [x[1] for x in combined]
    stds_sorted = [x[2] for x in combined]
    colors_sorted = [x[3] for x in combined]

    # Create the bar chart
    fig, ax = plt.subplots(figsize=(10, 6))

    x_positions = range(len(methods_sorted))
    bars = ax.bar(x_positions, means_sorted, yerr=stds_sorted,
                  color=colors_sorted, edgecolor='black', linewidth=0.8,
                  capsize=3)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(methods_sorted, rotation=45, ha='right')
    ax.set_ylabel(metric_names.get(metric, metric))
    ax.set_title(experiment_name + ' - ' + metric_names.get(metric, metric))

    # Most metrics live in 0..1, but MCC can be negative. Extend the axis
    # downwards when a bar is below zero so it is not cut off.
    if min(means_sorted) < 0:
        ax.set_ylim(-1.1, 1.1)
        ax.axhline(0, color='black', linewidth=0.8)
    else:
        ax.set_ylim(0, 1.1)
    ax.grid(axis='y', alpha=0.3)

    # Add legend in top right corner
    ax.legend(handles=legend_patches, loc='upper right', frameon=True,
              fancybox=True, shadow=True, framealpha=0.9)

    # Save the figure
    save_figure(fig, experiment_name + ' Bar Chart ' + metric_names.get(metric, metric))

"""
Create box plots for key metrics
"""
print("Creating box plots...")

# One box plot per key metric: distribution of the per-patch values
for metric in key_metrics:
    if metric not in metrics_to_plot:
        continue

    # Get all values for each method
    box_data = []
    box_labels = []
    box_colors = []

    method_values = {}
    for method_name, method_summary in summary_data.items():
        if metric in method_summary:
            method_values[method_name] = method_summary[metric]['values']

    # Sort methods by median performance
    sorted_methods = sorted(method_values.keys(), 
                          key=lambda m: np.median(method_values[m]), 
                          reverse=True)

    for method_name in sorted_methods:
        box_data.append(method_values[method_name])
        box_labels.append(method_name)
        box_colors.append(get_method_color(method_name))

    if not box_data:
        continue

    # Create box plot
    fig, ax = plt.subplots(figsize=(10, 6))

    # Tick labels are set with set_xticklabels below, so the boxplot
    # "labels" argument (deprecated in newer matplotlib) is not needed
    bp = ax.boxplot(box_data, patch_artist=True)

    # Color the boxes
    for patch, color in zip(bp['boxes'], box_colors):
        patch.set_facecolor(color)
        patch.set_edgecolor('black')

    # Style the other box plot elements
    for element in ['whiskers', 'caps', 'medians']:
        for item in bp[element]:
            item.set_color('black')

    ax.set_ylabel(metric_names.get(metric, metric))
    ax.set_title(experiment_name + ' - ' + metric_names.get(metric, metric))
    ax.set_xticklabels(box_labels, rotation=45, ha='right')

    # MCC can be negative for single patches. Extend the axis downwards
    # when any value is below zero so those patches stay visible.
    lowest_value = min(min(values) for values in box_data)
    if lowest_value < 0:
        ax.set_ylim(-1.1, 1.1)
    else:
        ax.set_ylim(0, 1.1)
    ax.grid(axis='y', alpha=0.3)

    # Add legend in top right corner
    ax.legend(handles=legend_patches, loc='upper right', frameon=True,
              fancybox=True, shadow=True, framealpha=0.9)

    # Save the figure
    save_figure(fig, experiment_name + ' Box Plot ' + metric_names.get(metric, metric))

"""
Create heatmap showing all metrics for all methods
"""
print("Creating heatmap...")

# Prepare data matrix: one row per method, one column per metric (mean value)
methods_list = list(summary_data.keys())
heatmap_data = []

for method_name in methods_list:
    method_row = []
    for metric in metrics_to_plot:
        if metric in summary_data[method_name]:
            method_row.append(summary_data[method_name][metric]['mean'])
        else:
            method_row.append(np.nan)
    heatmap_data.append(method_row)

heatmap_array = np.array(heatmap_data)

# Sort methods by average performance
avg_performance = np.nanmean(heatmap_array, axis=1)
sort_order = np.argsort(-avg_performance)  # Descending order

methods_sorted = [methods_list[i] for i in sort_order]
heatmap_sorted = heatmap_array[sort_order]

# Create heatmap
fig, ax = plt.subplots(figsize=(8, len(methods_sorted) * 0.4 + 2))

im = ax.imshow(heatmap_sorted, cmap='Greys', vmin=0, vmax=1, aspect='auto')

# Add text annotations
# 'Greys' runs from white (0) to black (1), so use white text on the dark
# cells to keep the numbers readable
for i in range(len(methods_sorted)):
    for j in range(len(metrics_to_plot)):
        value = heatmap_sorted[i, j]
        if not np.isnan(value):
            text_color = 'white' if value > 0.5 else 'black'
            ax.text(j, i, '{:.2f}'.format(value), 
                   ha='center', va='center', color=text_color)

ax.set_xticks(range(len(metrics_to_plot)))
ax.set_xticklabels([metric_names.get(m, m) for m in metrics_to_plot], 
                  rotation=45, ha='right')
ax.set_yticks(range(len(methods_sorted)))
ax.set_yticklabels(methods_sorted)
ax.set_title(experiment_name + ' - Performance Heatmap')

# Add colorbar
cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Performance (0-1)')

# Save the figure
save_figure(fig, experiment_name + ' Performance Heatmap')

"""
Create simple Bland-Altman plots
"""
print("Creating Bland-Altman plots...")

def make_bland_altman_plot(values_a, values_b, method_a_name, method_b_name, metric_name):
    """
    Draw and save a Bland-Altman plot for two methods

    Each point is one pair of values: x is the mean of the pair, y is
    a minus b. Horizontal lines mark the mean difference and the limits
    at mean +/- 2 standard deviations of the differences.
    """
    first = np.array(values_a)
    second = np.array(values_b)

    # Per-pair mean and difference
    pair_means = (first + second) / 2.0
    pair_diffs = first - second

    # Mean difference and the simple limits of agreement around it
    bias = np.mean(pair_diffs)
    spread = np.std(pair_diffs) if len(pair_diffs) > 1 else 0
    limit_hi = bias + 2 * spread
    limit_lo = bias - 2 * spread

    metric_label = metric_names.get(metric_name, metric_name)

    fig, ax = plt.subplots(figsize=(8, 6))

    # Data points
    ax.scatter(pair_means, pair_diffs, alpha=0.7, color='black', s=40)

    # Mean difference line
    ax.axhline(bias, color='blue', linestyle='-', linewidth=2,
               label='Mean difference: {:.3f}'.format(bias))

    # Upper and lower limit lines
    limit_lines = (
        (limit_hi, 'Upper limit: {:.3f}'),
        (limit_lo, 'Lower limit: {:.3f}'),
    )
    for limit_value, label_template in limit_lines:
        ax.axhline(limit_value, color='red', linestyle='--', linewidth=1,
                   label=label_template.format(limit_value))

    # Zero reference line
    ax.axhline(0, color='gray', linestyle=':', alpha=0.5)

    # Labels and title
    ax.set_xlabel(''.join(['Mean of ', method_a_name, ' and ', method_b_name]))
    ax.set_ylabel(''.join([method_a_name, ' - ', method_b_name]))
    ax.set_title(''.join(['Bland-Altman: ', method_a_name, ' vs ', method_b_name,
                          ' (', metric_label, ')']))

    ax.grid(True, alpha=0.3)
    ax.legend()

    # Save the plot
    plot_name = ''.join([experiment_name, ' Bland-Altman ', method_a_name, ' vs ',
                         method_b_name, ' ', metric_label])
    save_figure(fig, plot_name)

# Find the best method for each key metric
# NOTE: this loop rebinds best_method, which the comparison cell earlier set
# to the method with the highest mean Dice score.
best_methods = {}
for metric in key_metrics:
    # Start below every possible score: MCC can be zero or negative, and a
    # start value of 0 would then leave the metric without a best method
    best_score = float('-inf')
    best_method = None

    for method_name in summary_data.keys():
        if metric in summary_data[method_name]:
            score = summary_data[method_name][metric]['mean']
            if score > best_score:
                best_score = score
                best_method = method_name

    if best_method is not None:
        best_methods[metric] = best_method

# Create Bland-Altman plots comparing methods to the best one
for metric in key_metrics:
    if metric not in best_methods:
        continue

    best_method_name = best_methods[metric]
    best_method_values = summary_data[best_method_name][metric]['values']

    # Compare other methods to the best one
    for method_name in summary_data.keys():
        if method_name == best_method_name:
            continue

        if metric not in summary_data[method_name]:
            continue

        method_values = summary_data[method_name][metric]['values']

        # Make sure we have enough data points
        min_length = min(len(best_method_values), len(method_values))
        if min_length >= 3:  # Need at least 3 points
            # Use first min_length values from each method so the pairs line up
            values_a = best_method_values[:min_length]
            values_b = method_values[:min_length]

            make_bland_altman_plot(values_a, values_b, best_method_name, 
                                 method_name, metric)

print("All plots created successfully!")

Preparing data for plots...
Creating bar charts...
Saved: STED 0.375% Bar Chart Accuracy
Saved: STED 0.375% Bar Chart Precision
Saved: STED 0.375% Bar Chart Recall
Saved: STED 0.375% Bar Chart Specificity
Saved: STED 0.375% Bar Chart Balanced Accuracy
Saved: STED 0.375% Bar Chart Dice Score
Saved: STED 0.375% Bar Chart IoU Score
Saved: STED 0.375% Bar Chart MCC
Creating box plots...
Saved: STED 0.375% Box Plot Dice Score
Saved: STED 0.375% Box Plot IoU Score
Saved: STED 0.375% Box Plot MCC
Creating heatmap...
Saved: STED 0.375% Performance Heatmap
Creating Bland-Altman plots...
Saved: STED 0.375% Bland-Altman SEMI vs 60% Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs FREEHAND Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs OVAL Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs ILASTIK Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs OTSU Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs PLANKSTER Dice Score
Saved: STED 0.375% Bland-Altman SEMI vs PORED2 Dice Score
Saved: STE