## MULTIPLE GOLD STANDARD APPROACH

In [1]:
"""This cell imports all the basic Python libraries we need for image processing, data analysis, and file operations."""

import os
import re
import glob
import csv
import itertools
import sys
from math import sqrt
import numpy as np
import cv2

In [None]:
"""This cell defines where our data files are located and which segmentation methods
we want to compare against the gold standard."""

import os
from pathlib import Path
notebook_dir = Path.cwd()
proj_root = notebook_dir.parent
BASE_DIR = str(proj_root / "CRYO-SEM DATA" / "CRYO-SEM X30000")


# Setup paths to our data folders
replicate_dirs = [
    BASE_DIR / "CRYO-SEM X30000 [1]",
    BASE_DIR / "CRYO-SEM X30000 [2]",
    BASE_DIR / "CRYO-SEM X30000 [3]",
    BASE_DIR / "CRYO-SEM X30000 [4]",
]

# These are all the different methods we want to test
method_files = [
    "60%.tif",
    "FREEHAND.tif", 
    "OVAL.tif",
    "ILASTIK.tif",
    "OTSU.tif",
    "PLANKSTER.tif",
    "PORED2.tif",
    "SAMJ.tif",
    "SEMI.tif",
    "UNET.tif",
]

FAMILY_TAG = "CRYO-SEM X30000"
display_family_tag = "[" + FAMILY_TAG + "]"
group_dir = BASE_DIR / "MULTIPLE GS METHOD"

In [3]:
"""This cell tries to import optional libraries for statistics and image processing,
keeping track of which ones are available to use later."""

# Try to import scipy for better statistics
try:
    from scipy.stats import t as tdist
    from scipy.stats import kruskal
    from scipy.stats import wilcoxon 
    from scipy.stats import binomtest
    from scipy.stats import t as analysis_t
    got_scipy = True
except Exception:
    got_scipy = False

# Try to import different image reading libraries as backup options
try:
    import tifffile as tiff
    got_tifffile = True
except Exception:
    got_tifffile = False

try:
    from PIL import Image
    got_pil = True
except Exception:
    got_pil = False

# Try to import TIFF writing libraries
try:
    import tifffile as tiffw
    got_tiff_write = True
except Exception:
    got_tiff_write = False

try:
    from PIL import Image as PILImage
    got_pil_write = True
except Exception:
    got_pil_write = False

In [4]:
"""This cell creates functions to automatically find gold standard files and read TIFF
images using multiple backup methods in case one fails."""

def find_gold_mask(rep_dir):
    """
    Find the gold standard file in each replicate folder
    The files might have different names so we search carefully
    """
    if not os.path.isdir(rep_dir):
        return None
    
    # Look for files that match gold standard pattern
    pattern = re.compile(r"(?i)^gold standard\s*(\[\s*\d+\s*\])?\.(tif|tiff)$")
    try:
        candidates = [os.path.join(rep_dir, f) for f in os.listdir(rep_dir) if pattern.match(f)]
    except Exception:
        candidates = []
    
    # If no exact matches try broader search
    if not candidates:
        for ext in ("tif","tiff"):
            candidates += glob.glob(os.path.join(rep_dir, "GOLD STANDARD*." + ext))
            candidates += glob.glob(os.path.join(rep_dir, "Gold Standard*." + ext))
            candidates += glob.glob(os.path.join(rep_dir, "gold standard*." + ext))
    
    if not candidates:
        return None
    
    # Try to match the replicate number if possible
    m = re.search(r"\[(\d+)\]", os.path.basename(rep_dir))
    if m:
        idx = m.group(1)
        for p in candidates:
            if re.search(rf"\[\s*{idx}\s*\]", os.path.basename(p)):
                return p
    
    return sorted(candidates)[0]

def read_tiff_image(path):
    """
    Try to read a TIFF image using different methods
    OpenCV is first choice but we have backups
    """
    # Try OpenCV first since its usually reliable
    img = cv2.imread(path, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_GRAYSCALE)
    if img is not None:
        return img
    
    # Try tifffile library if OpenCV fails
    if got_tifffile:
        try:
            return tiff.imread(path)
        except Exception:
            pass
    
    # Try PIL as last resort
    if got_pil:
        try:
            with Image.open(path) as im:
                if "I;16" in im.mode:
                    im = im.convert("I;16")
                else:
                    im = im.convert("L")
                return np.array(im)
        except Exception:
            pass
    
    return None

In [5]:
"""This cell defines functions to convert images to grayscale, prepare them for analysis,
and calculate optimal thresholds for separating foreground from background."""

def convert_to_grayscale(arr):
    """
    Convert image to grayscale if it has multiple channels
    We need single channel images for our analysis
    """
    if arr.ndim == 2:
        return arr
    
    # Handle RGB or RGBA images
    if arr.ndim == 3 and arr.shape[-1] in (3,4):
        a = arr
        # Convert to uint8 if needed for OpenCV functions
        if a.dtype != np.uint8:
            a_min = float(a.min())
            a_max = float(a.max())
            a = ((a - a_min) / (a_max - a_min + 1e-12) * 255.0).astype(np.uint8)
        
        # Remove alpha channel if present
        if a.shape[-1] == 4:
            a = cv2.cvtColor(a, cv2.COLOR_BGRA2BGR)
        return cv2.cvtColor(a, cv2.COLOR_BGR2GRAY)
    
    # If more than 3D just take first channel
    if arr.ndim > 2:
        return convert_to_grayscale(arr[..., 0])
    
    return arr

def make_image_uint(img):
    """
    Convert image to unsigned integer format for processing
    This helps with thresholding operations
    """
    if img.dtype in (np.uint8, np.uint16):
        return img
    
    # Handle floating point images by scaling to 0-255
    if np.issubdtype(img.dtype, np.floating):
        a_min = float(img.min())
        a_max = float(img.max())
        if a_max > a_min:
            scaled = (img - a_min) / (a_max - a_min)
        else:
            scaled = np.zeros_like(img, float)
        return (scaled * 255.0 + 0.5).astype(np.uint8)
    
    # Convert other types appropriately
    return img.astype(np.uint16 if img.max() > 255 else np.uint8)

def otsu_threshold_manual(g8):
    """
    Calculate Otsu threshold manually if OpenCV doesnt have it
    This finds the best threshold to separate foreground from background
    The method tries every possible threshold and picks the best one
    """
    hist = np.bincount(g8.ravel(), minlength=256).astype(np.float64)
    total = g8.size
    if total == 0:
        return 0
    
    sum_total = np.dot(np.arange(256, dtype=np.float64), hist)
    sumB = 0.0
    wB = 0.0 
    maximum = -1.0
    threshold = 0
    
    # Try each possible threshold value to find the best separation
    for t in range(256):
        wB += hist[t]
        if wB == 0: 
            continue
        wF = total - wB
        if wF == 0: 
            break
        
        sumB += t * hist[t]
        mB = sumB / wB
        mF = (sum_total - sumB) / wF
        between = wB * wF * (mB - mF) ** 2
        
        if between > maximum:
            maximum = between
            threshold = t
    
    return threshold

In [6]:
"""This cell converts grayscale images into binary masks where pores are marked as 1
and background as 0, using automatic thresholding."""

def make_binary_mask(img_gray):
    """
    Convert grayscale image to binary mask where pores are 1 and background is 0
    We assume pores are the dark pixels (black equals pore)
    """
    g = make_image_uint(img_gray)
    unique_vals = np.unique(g)
    
    # If only two values use the darker one as pores
    if unique_vals.size == 2:
        pores = (g == int(unique_vals[0]))
        return pores.astype(np.uint8)
    
    # Convert to 8-bit for thresholding
    g8 = (g / 257).astype(np.uint8) if g.dtype == np.uint16 else g.astype(np.uint8)
    
    # Try to use OpenCV Otsu thresholding first
    if hasattr(cv2, "THRESH_OTSU"):
        _, th = cv2.threshold(g8, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return (th == 0).astype(np.uint8)
    else:
        # Use manual Otsu calculation as fallback
        thr = otsu_threshold_manual(g8)
        th = (g8 > thr).astype(np.uint8) * 255
        return (th == 0).astype(np.uint8)

def read_mask(path):
    """
    Read an image file and convert it to a binary mask
    Returns numpy array where 1 equals pore and 0 equals background
    """
    if not os.path.exists(path):
        raise FileNotFoundError("file does not exist: " + path)
    
    raw = read_tiff_image(path)
    if raw is None:
        raise FileNotFoundError("cannot read image: " + path)
    
    gray = convert_to_grayscale(raw)
    if gray.ndim != 2:
        raise ValueError("not a single-channel image: " + path)
    
    return make_binary_mask(gray)

In [7]:
"""This cell creates colorful overlay images that visually show where each method agrees
or disagrees with the gold standard using green, red, and blue colors."""

def save_overlay_image(gt, pr, save_path_tif):
    """
    Create and save an overlay image showing the comparison results
    True Positives show as green outlines
    False Positives show as red fill
    False Negatives show as blue fill
    Gold standard forms the grayscale background
    """
    # Create grayscale background from gold standard
    base_gray = (1 - gt) * 255
    overlay = cv2.cvtColor(base_gray.astype(np.uint8), cv2.COLOR_GRAY2BGR)

    # Calculate different types of pixels for comparison
    tp = (gt == 1) & (pr == 1)  # True Positive: both methods found pore
    fp = (gt == 0) & (pr == 1)  # False Positive: method found pore but gold standard didnt
    fn = (gt == 1) & (pr == 0)  # False Negative: gold standard has pore but method missed it

    # Add False Positives as red fill
    fp_layer = overlay.copy()
    fp_layer[fp] = (0, 0, 255)  # BGR format so this is red
    overlay = cv2.addWeighted(fp_layer, 0.45, overlay, 0.55, 0)

    # Add False Negatives as blue fill  
    fn_layer = overlay.copy()
    fn_layer[fn] = (255, 0, 0)  # BGR format so this is blue
    overlay = cv2.addWeighted(fn_layer, 0.45, overlay, 0.55, 0)

    # Add True Positives as green outline
    edges = cv2.Canny((tp.astype(np.uint8) * 255), 50, 150)
    overlay[edges > 0] = (0, 255, 0)  # BGR format so this is green

    # Make sure output directory exists
    os.makedirs(os.path.dirname(save_path_tif), exist_ok=True)
    if not save_path_tif.lower().endswith((".tif", ".tiff")):
        save_path_tif += ".tif"

    # Try different methods to save the image
    success = False
    try:
        success = cv2.imwrite(save_path_tif, overlay)
    except Exception:
        success = False
    
    if not success and got_tiff_write:
        try:
            tiffw.imwrite(save_path_tif, overlay)
            success = True
        except Exception:
            success = False
    
    if not success and got_pil_write:
        try:
            # PIL uses RGB but OpenCV uses BGR so we need to flip
            PILImage.fromarray(overlay[:, :, ::-1]).save(save_path_tif, format="TIFF")
            success = True
        except Exception:
            success = False
    
    if not success:
        raise IOError("failed to write TIFF: " + save_path_tif)

In [8]:
"""This cell calculates accuracy metrics like Dice coefficient, precision, recall,
and others that tell us how well each segmentation method performs."""

def safe_divide(numerator, denominator):
    """Helper function to avoid division by zero errors"""
    return float(numerator) / float(denominator) if denominator else 0.0

def count_pixels(gt, pred):
    """
    Count True Positives False Positives True Negatives False Negatives
    These counts tell us how the prediction compares to ground truth
    """
    gt = gt.astype(np.uint8)
    pred = pred.astype(np.uint8)
    
    tp = int(np.sum((gt == 1) & (pred == 1)))  # Both say pore
    tn = int(np.sum((gt == 0) & (pred == 0)))  # Both say background  
    fp = int(np.sum((gt == 0) & (pred == 1)))  # Pred says pore truth says background
    fn = int(np.sum((gt == 1) & (pred == 0)))  # Pred says background truth says pore
    
    return tp, fp, tn, fn

def compute_metrics(gt, pred):
    """
    Calculate all the performance metrics we need
    These tell us how good a method is at finding pores correctly
    """
    tp, fp, tn, fn = count_pixels(gt, pred)
    
    # Basic accuracy: how many pixels did we get right overall?
    acc = safe_divide(tp + tn, tp + tn + fp + fn)
    
    # Precision: of the pixels we called pores how many really were pores?
    prec = safe_divide(tp, tp + fp)
    
    # Recall (Sensitivity): of the real pores how many did we find?
    rec = safe_divide(tp, tp + fn)
    
    # Specificity: of the real background pixels how many did we get right?
    spec = safe_divide(tn, tn + fp)
    
    # Balanced accuracy: average of sensitivity and specificity
    ba = 0.5 * (rec + spec)
    
    # Dice coefficient (also called F1 score): harmonic mean of precision and recall
    dice = safe_divide(2 * tp, 2 * tp + fp + fn)
    
    # IoU (Intersection over Union): area of overlap divided by area of union
    iou = safe_divide(tp, tp + fp + fn)
    
    # Matthews Correlation Coefficient: overall measure of quality
    denominator_product = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    mcc = safe_divide(tp*tn - fp*fn, sqrt(float(denominator_product))) if denominator_product > 0 else 0.0
    
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "specificity": spec,
        "balanced_accuracy": ba,
        "f1_dice": dice,
        "iou_jaccard": iou,
        "mcc": mcc,
        "TP": tp, 
        "FP": fp, 
        "TN": tn, 
        "FN": fn,
    }

def calculate_confidence_interval(mean, sd, n):
    """
    Calculate 95% confidence interval for the mean
    This tells us the range where the true mean probably lies
    """
    if n is None or n < 2 or sd is None or np.isnan(sd):
        return (np.nan, np.nan)
    
    # Use t-distribution if we have scipy otherwise use normal approximation
    if got_scipy:
        t_critical = float(tdist.ppf(0.975, df=n-1))
    else:
        t_critical = 1.96  # Normal approximation
    
    margin_of_error = t_critical * sd / sqrt(n)
    return (mean - margin_of_error, mean + margin_of_error)

In [9]:
"""This cell processes each replicate folder individually, comparing every segmentation
method against the gold standard and saving results as CSV files and overlay images."""

print("Starting analysis of each replicate...")

# Process each replicate folder one by one
for root in replicate_dirs:
    print("Processing replicate:", os.path.basename(root))
    
    gold_path = find_gold_mask(root)
    output_dir = os.path.join(root, "Accuracy")
    overlay_dir = os.path.join(output_dir, "Overlays_TIFF")
    
    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(overlay_dir, exist_ok=True)

    # Skip if no gold standard found
    if not gold_path or not os.path.exists(gold_path):
        print("SKIPPING - no gold standard found in:", root)
        continue

    # Load the gold standard mask
    gt_mask = read_mask(gold_path)
    replicate_tag = display_family_tag + " " + os.path.basename(root)

    # Test each method against the gold standard
    for method_file in method_files:
        prediction_path = os.path.join(root, method_file)
        
        # Skip if method file doesnt exist
        if not os.path.exists(prediction_path):
            print("SKIPPING - file not found:", prediction_path)
            continue

        method_name = os.path.splitext(os.path.basename(prediction_path))[0]
        
        # Load the prediction mask
        prediction_mask = read_mask(prediction_path)

        # Make sure images are the same size
        if gt_mask.shape != prediction_mask.shape:
            error_msg = root + " -> shape mismatch for " + method_name + ": " + str(gt_mask.shape) + " vs " + str(prediction_mask.shape)
            raise ValueError(error_msg)

        # Calculate performance metrics
        metrics = compute_metrics(gt_mask, prediction_mask)

        # Save metrics to CSV file
        csv_file_path = os.path.join(output_dir, "Metric Results [" + method_name + "].csv")
        with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["metric","value"])
            
            # Write main metrics in order of importance
            for metric_name in ["f1_dice","iou_jaccard","precision","recall","specificity",
                              "balanced_accuracy","accuracy","mcc"]:
                writer.writerow([metric_name, metrics[metric_name]])
            
            # Add empty row for separation
            writer.writerow([])
            
            # Write confusion matrix values
            writer.writerow(["TP", metrics["TP"]])
            writer.writerow(["FP", metrics["FP"]])
            writer.writerow(["TN", metrics["TN"]]) 
            writer.writerow(["FN", metrics["FN"]])

        # Create and save overlay image
        overlay_file_path = os.path.join(overlay_dir, method_name + " " + replicate_tag + " overlay.tif")
        save_overlay_image(gt_mask, prediction_mask, overlay_file_path)

        print(method_name + " " + replicate_tag + ": Dice " + str(round(metrics['f1_dice'], 3)) + " -> saved to " + csv_file_path)

Starting analysis of each replicate...
Processing replicate: CRYO-SEM X30000 [1]
SKIPPING - no gold standard found in: C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X30000\CRYO-SEM X30000 [1]
Processing replicate: CRYO-SEM X30000 [2]
SKIPPING - no gold standard found in: C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X30000\CRYO-SEM X30000 [2]
Processing replicate: CRYO-SEM X30000 [3]
SKIPPING - no gold standard found in: C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X30000\CRYO-SEM X30000 [3]
Processing replicate: CRYO-SEM X30000 [4]
SKIPPING - no gold standard found in: C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X30000\CRYO-SEM X30000 [4]


In [10]:
"""This cell collects data from all replicates and creates a master dataset showing
how each method performed across all test images."""

print("\nCreating group summary across all replicates...")

# List of metrics we want to summarize
metrics_to_summarize = ["f1_dice","iou_jaccard","precision","recall","specificity",
                       "balanced_accuracy","accuracy","mcc"]

# Collect data from all replicates for group analysis
all_replicate_data = []
for method_file in method_files:
    method_name = os.path.splitext(method_file)[0]
    
    for root in replicate_dirs:
        gold_standard_path = find_gold_mask(root)
        prediction_path = os.path.join(root, method_file)
        
        # Skip if either file is missing
        if not (gold_standard_path and os.path.exists(gold_standard_path) and os.path.exists(prediction_path)):
            continue
        
        # Load both masks
        gt = read_mask(gold_standard_path)
        pr = read_mask(prediction_path)
        
        # Check size compatibility
        if gt.shape != pr.shape:
            error_msg = root + " -> shape mismatch for " + method_name + ": " + str(gt.shape) + " vs " + str(pr.shape)
            raise ValueError(error_msg)
        
        # Calculate metrics for this replicate
        m = compute_metrics(gt, pr)
        
        # Add to our collection
        row_data = [method_name, os.path.basename(root)]
        for metric in metrics_to_summarize:
            row_data.append(m[metric])
        all_replicate_data.append(row_data)

# Save per-replicate data
os.makedirs(group_dir, exist_ok=True)
per_replicate_file = os.path.join(group_dir, "Group_PerReplicate_Values [" + FAMILY_TAG + "].csv")
with open(per_replicate_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    header_row = ["method","replicate"] + metrics_to_summarize
    writer.writerow(header_row)
    writer.writerows(all_replicate_data)

print("Saved per-replicate data:", per_replicate_file)


Creating group summary across all replicates...
Saved per-replicate data: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Group_PerReplicate_Values [CRYO-SEM X30000].csv


In [11]:
"""This cell calculates mean values, standard deviations, and confidence intervals
for each method across all replicates to summarize overall performance."""

# Calculate summary statistics across replicates
summary_file = os.path.join(group_dir, "Group_Summary [" + FAMILY_TAG + "].csv")
summary_rows = []

for method_file in method_files:
    method_name = os.path.splitext(method_file)[0]
    
    # Find all rows for this method
    method_data = [row for row in all_replicate_data if row[0] == method_name]
    n_replicates = len(method_data)
    
    # Skip if no data for this method
    if n_replicates == 0:
        continue
    
    # Calculate statistics for each metric
    for metric_index, metric_name in enumerate(metrics_to_summarize):
        # Metric values start at index 2 (after method name and replicate name)
        column_index = metric_index + 2
        
        # Extract all values for this metric
        metric_values = np.array([row[column_index] for row in method_data], dtype=float)
        
        # Calculate mean
        mean_value = float(metric_values.mean())
        
        # Calculate standard deviation and confidence interval
        if n_replicates > 1:
            std_value = float(metric_values.std(ddof=1))  # Sample standard deviation
            ci_low, ci_high = calculate_confidence_interval(mean_value, std_value, n_replicates)
        else:
            # Cannot calculate std dev with only 1 sample
            std_value = float("nan")
            ci_low = float("nan") 
            ci_high = float("nan")
        
        # Save this summary row
        summary_row = [method_name, metric_name, mean_value, std_value, n_replicates, ci_low, ci_high]
        summary_rows.append(summary_row)

# Write summary statistics file
with open(summary_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    header_row = ["method", "metric", "mean", "sd", "n_images", "ci95_low", "ci95_high"]
    writer.writerow(header_row)
    writer.writerows(summary_rows)

print("Saved group summary:", summary_file)

# Print version information for reproducibility
print("\nSoftware versions used:")
print("Python:", sys.version.split()[0])
print("NumPy:", np.__version__)
print("OpenCV:", cv2.__version__)
if got_scipy:
    print("SciPy: available (used for confidence intervals)")
else:
    print("SciPy: not available (used normal approximation)")

Saved group summary: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Group_Summary [CRYO-SEM X30000].csv

Software versions used:
Python: 3.10.18
NumPy: 1.26.4
OpenCV: 4.10.0
SciPy: available (used for confidence intervals)


In [12]:
"""This cell imports plotting libraries and sets up all the directories and settings
needed to create figures and statistical analysis."""

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Setup for figures
FAMILY_DIR = group_dir
N_IN_TITLE = 4  # number of gold standard images for titles

# Settings for Bland Altman plots
ENABLE_BLAND_ALTMAN = True
BA_PAIRS_VS_REF_ONLY = True
BA_MIN_COMMON = 2

SUMMARY_CSV = os.path.join(FAMILY_DIR, "Group_Summary [" + FAMILY_TAG + "].csv")
PERREP_CSV  = os.path.join(FAMILY_DIR, "Group_PerReplicate_Values [" + FAMILY_TAG + "].csv")
FIG_DIR   = os.path.join(FAMILY_DIR, "Figures")
STATS_DIR = os.path.join(FAMILY_DIR, "Stats")
BA_DIR    = os.path.join(FIG_DIR, "Bland-Altman")
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(STATS_DIR, exist_ok=True)
os.makedirs(BA_DIR, exist_ok=True)

np.random.seed(2025)

In [13]:
"""This cell defines the visual style, colors, and categories for all plots,
making traditional methods blue, semi-automated purple, and fully automated red."""

# Set up plot styling
mpl.rcParams.update({
    "font.size": 9,
    "axes.linewidth": 0.8,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "xtick.major.width": 0.8,
    "ytick.major.width": 0.8,
    "pdf.fonttype": 42,
    "ps.fonttype": 42,
    "savefig.dpi": 600,
})

# Define method categories and colors
TRADITIONAL = {"FREEHAND", "OVAL"}
SEMI_AUTO   = {"SEMI", "SAMJ", "ILASTIK", "60%"}
FULL_AUTO   = {"PORED2", "UNET", "OTSU", "PLANKSTER"}

C_TRAD  = "#9ecae1"  # light blue
C_SEMI  = "#d0b7ff"  # light purple
C_AUTO  = "#f7b6b6"  # light red
C_OTHER = "#dddddd"

def method_color_and_group(name):
    """Assign colors based on method type"""
    u = name.strip().upper()
    if u in TRADITIONAL: 
        return C_TRAD, "Traditional"
    if u in SEMI_AUTO:   
        return C_SEMI, "Semi-automated"
    if u in FULL_AUTO:   
        return C_AUTO, "Fully automated"
    return C_OTHER, "Other"

# Create legend handles for the plots
legend_handles = [
    Patch(facecolor=C_TRAD, edgecolor='k', label='Traditional'),
    Patch(facecolor=C_SEMI, edgecolor='k', label='Semi-automated'),
    Patch(facecolor=C_AUTO, edgecolor='k', label='Fully automated'),
]

# Define metric names for nice display
NICE = {
    "f1_dice": "Dice (F1)",
    "iou_jaccard": "IoU (Jaccard)",
    "mcc": "Matthews CC",
    "precision": "Precision",
    "recall": "Recall (Sensitivity)",
    "specificity": "Specificity",
    "balanced_accuracy": "Balanced Accuracy",
    "accuracy": "Accuracy",
}

METRICS_ALL  = ["f1_dice","iou_jaccard","mcc","precision","recall","specificity","balanced_accuracy","accuracy"]
BOX_METRICS  = ["f1_dice","iou_jaccard","mcc"]
HEAT_METRICS = ["f1_dice","iou_jaccard","mcc","precision","recall","specificity","balanced_accuracy","accuracy"]

In [14]:
"""This cell creates helper functions for reading data files, cleaning up text for
filenames, and saving figures."""

def sanitize(s):
    """Clean up strings for filenames"""
    s2 = re.sub(r'[<>:"/\\|?*]+', "_", s)
    return re.sub(r"\s+", " ", s2).strip()

def safe_float(x):
    """Convert to float safely"""
    if x is None: 
        return np.nan
    s = str(x).strip()
    if s == "" or s.lower() == "nan": 
        return np.nan
    try: 
        return float(s)
    except Exception: 
        return np.nan

def read_group_summary(path):
    """Read the group summary CSV file"""
    rows = []
    with open(path, newline="", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            rows.append({
                "method": row["method"],
                "metric": row["metric"],
                "mean":  safe_float(row.get("mean")),
                "sd":    safe_float(row.get("sd")),
                "n":     int(safe_float(row.get("n_images")) or 0),
                "lo":    safe_float(row.get("ci95_low")),
                "hi":    safe_float(row.get("ci95_high")),
            })
    return rows

def read_perrep(path):
    """Read the per-replicate CSV file"""
    rows = []
    with open(path, newline="", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            rows.append({
                "method": row["method"],
                "replicate": row["replicate"],
                "f1_dice": safe_float(row.get("f1_dice")),
                "iou_jaccard": safe_float(row.get("iou_jaccard")),
                "precision": safe_float(row.get("precision")),
                "recall": safe_float(row.get("recall")),
                "specificity": safe_float(row.get("specificity")),
                "balanced_accuracy": safe_float(row.get("balanced_accuracy")),
                "accuracy": safe_float(row.get("accuracy")),
                "mcc": safe_float(row.get("mcc")),
            })
    return rows

def table_for_metric(summary_rows, metric):
    """Get rows for a specific metric"""
    return [r for r in summary_rows if r["metric"] == metric and not np.isnan(r["mean"])]

def ci_or_fallback_lo_hi(mean, lo, hi, sd, n):
    """Calculate confidence interval with fallback"""
    if not np.isnan(lo) and not np.isnan(hi):
        return lo, hi
    if (sd is not None) and (not np.isnan(sd)) and n is not None and n >= 2:
        half = 1.96 * sd / np.sqrt(n)
        return mean - half, mean + half
    return np.nan, np.nan

def save_fig(fig, name_base, dirpath=FIG_DIR):
    """Save figure as both TIFF and PDF"""
    tif = os.path.join(dirpath, sanitize(name_base) + ".tif")
    pdf = os.path.join(dirpath, sanitize(name_base) + ".pdf")
    fig.tight_layout()
    fig.savefig(tif, dpi=600)
    fig.savefig(pdf)
    plt.close(fig)
    print("Saved:", tif)
    print("Saved:", pdf)

In [15]:
"""This cell loads the CSV files containing our calculated metrics 
so we can create plots and statistical comparisons from the data."""

# Load data for plotting
if not os.path.exists(SUMMARY_CSV):
    raise FileNotFoundError("Missing: " + SUMMARY_CSV)
if not os.path.exists(PERREP_CSV):
    raise FileNotFoundError("Missing: " + PERREP_CSV)

summary_rows = read_group_summary(SUMMARY_CSV)
perrep_rows  = read_perrep(PERREP_CSV)

In [16]:
"""This cell defines functions for performing statistical tests like Wilcoxon tests
and sign tests to determine if differences between methods are statistically significant."""

def build_series_by_replicate(metric):
    """Return dictionary of method to replicate to value"""
    d = {}
    for row in perrep_rows:
        m = row["method"]
        r = row["replicate"] 
        v = row[metric]
        if np.isnan(v) or r is None or r == "": 
            continue
        d.setdefault(m, {})[r] = float(v)
    return d

def holm_correction(pairs_pvals):
    """Apply Holm correction for multiple comparisons"""
    ranked = sorted(pairs_pvals, key=lambda x: x[1])
    m = len(ranked)
    adj = {}
    for i, (pair, p) in enumerate(ranked, start=1):
        adj[pair] = min(1.0, p * (m - i + 1))
    return adj

def p_to_stars(p):
    """Convert p-value to significance stars"""
    if p < 0.001: 
        return '***'
    if p < 0.01:  
        return '**'
    if p < 0.05:  
        return '*'
    return ''

def wilcoxon_exact_or_pratt(a, b):
    """Perform Wilcoxon test with exact or approximate method"""
    a = np.asarray(a, float)
    b = np.asarray(b, float)
    d = a - b
    nz = d != 0
    n_eff = int(np.count_nonzero(nz))
    if n_eff < 2:
        return (np.nan, np.nan, n_eff, "NA")
    if np.all(nz) and n_eff <= 25:
        res = wilcoxon(a, b, alternative='two-sided', zero_method='wilcox',
                       correction=False, method='exact')
        return (float(res.statistic), float(res.pvalue), n_eff, "exact")
    res = wilcoxon(a, b, alternative='two-sided', zero_method='pratt',
                   correction=True, method='approx')
    return (float(res.statistic), float(res.pvalue), n_eff, "approx(pratt)")

def sign_test_two_sided(a, b):
    """Perform two-sided sign test"""
    a = np.asarray(a, float)
    b = np.asarray(b, float)
    d = a - b
    pos = int(np.sum(d > 0))
    neg = int(np.sum(d < 0))
    n   = pos + neg
    if n == 0:
        return (np.nan, 0, 0)
    p = binomtest(k=pos, n=n, p=0.5, alternative='two-sided').pvalue
    return (float(p), n, pos)

def hodges_lehmann(a, b):
    """Calculate Hodges-Lehmann estimator"""
    d = np.asarray(a, float) - np.asarray(b, float)
    return float(np.median(d))

def compute_ref_and_holm(metric):
    """Compute reference method and Holm-corrected p-values"""
    tbl = table_for_metric(summary_rows, metric)
    if not tbl: 
        return None, {}, {}, {}, {}
    ref = max(tbl, key=lambda d: d["mean"])["method"]
    series = build_series_by_replicate(metric)
    methods = sorted(series.keys())
    pairs = list(itertools.combinations(methods, 2))

    wx_pairs = []
    sign_pairs = []
    wilcox_cache = {}
    sign_cache   = {}

    for a, b in pairs:
        ra = series.get(a, {})
        rb = series.get(b, {})
        common = sorted(set(ra.keys()) & set(rb.keys()))
        if len(common) < 2:
            continue
        a_vals = np.array([ra[r] for r in common], float)
        b_vals = np.array([rb[r] for r in common], float)

        W, p_wx, n_eff, mode = wilcoxon_exact_or_pratt(a_vals, b_vals)
        p_sign, n_sign, pos = sign_test_two_sided(a_vals, b_vals)
        HL = hodges_lehmann(a_vals, b_vals)
        med = float(np.median(a_vals - b_vals))

        if not np.isnan(p_wx):  
            wx_pairs.append(((a,b), p_wx))
        if not np.isnan(p_sign): 
            sign_pairs.append(((a,b), p_sign))

        wilcox_cache[(a,b)] = (n_eff, W, p_wx, mode, HL, med)
        sign_cache[(a,b)]   = (n_sign, pos, p_sign)

    p_holm_wx   = holm_correction(wx_pairs) if wx_pairs else {}
    p_holm_sign = holm_correction(sign_pairs) if sign_pairs else {}
    return ref, p_holm_wx, p_holm_sign, wilcox_cache, sign_cache

def lookup_pair(dct, a, b, default=np.nan):
    """Look up pair in dictionary with symmetric fallback"""
    return dct.get((a,b), dct.get((b,a), default))

In [17]:
"""This cell creates bar charts showing the mean performance of each method with 
error bars, ranked from best to worst performance for each metric."""

# Create bar charts for each metric
print("Creating bar charts...")
for metric in METRICS_ALL:
    tbl = table_for_metric(summary_rows, metric)
    if not tbl: 
        continue

    tbl.sort(key=lambda d: d["mean"], reverse=True)
    methods = [t["method"] for t in tbl]
    means   = np.array([t["mean"] for t in tbl], float)

    los, his, ns = np.zeros_like(means), np.zeros_like(means), []
    for i, t in enumerate(tbl):
        lo, hi = ci_or_fallback_lo_hi(t["mean"], t["lo"], t["hi"], t["sd"], t["n"])
        los[i], his[i] = lo, hi
        ns.append(t["n"])
    lower = np.where(np.isnan(los), 0.0, means - los)
    upper = np.where(np.isnan(his), 0.0, his - means)
    yerr  = np.vstack((lower, upper))

    bar_colors = [method_color_and_group(m)[0] for m in methods]

    fig, ax = plt.subplots(figsize=(7.5, 4.2))
    x = np.arange(len(methods))
    ax.bar(x, means, color=bar_colors, edgecolor="black", linewidth=0.8)
    ax.errorbar(x, means, yerr=yerr, fmt='none', ecolor='black', elinewidth=1.0, capsize=3)

    ax.set_xticks(x)
    ax.set_xticklabels(methods, rotation=45, ha='right')
    ax.set_title(FAMILY_TAG + " [" + NICE.get(metric, metric) + "] (n = " + str(N_IN_TITLE) + ")")
    ax.set_ylabel(NICE.get(metric, metric) + " (mean ± 95% CI)")

    y_tops = means + np.where(np.isnan(upper), 0.0, upper)
    ylim_top = float(np.nanmax(y_tops) + 0.05)
    ax.set_ylim(0, min(1.15, max(1.0, ylim_top)))
    ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(0.1))
    ax.grid(axis='y', color=str(0.9), linestyle='-', linewidth=0.5)

    leg = ax.legend(handles=legend_handles, loc='upper right', frameon=True)
    leg.get_frame().set_edgecolor('black')
    leg.get_frame().set_linewidth(0.8)
    leg.get_frame().set_alpha(1.0)
    leg.get_frame().set_facecolor('white')

    save_fig(fig, FAMILY_TAG + " - Bar " + NICE.get(metric, metric))

Creating bar charts...


In [18]:
"""This cell creates boxplot charts showing the distribution and variability
of performance for each method, with individual data points overlaid as dots."""

# Create boxplots for key metrics
print("Creating boxplots...")
def build_series(metric):
    """Build series data for boxplots"""
    methods = sorted(list({x["method"] for x in perrep_rows}))
    series = {}
    for m in methods:
        vals = [x[metric] for x in perrep_rows if x["method"] == m and not np.isnan(x[metric])]
        if len(vals) > 0:
            series[m] = np.array(vals, float)
    return series

for metric in BOX_METRICS:
    data = build_series(metric)
    if not data: 
        continue

    order = sorted(data.keys(), key=lambda m: np.median(data[m]), reverse=True)
    vectors = [data[m] for m in order]
    box_colors = [method_color_and_group(m)[0] for m in order]

    fig, ax = plt.subplots(figsize=(7.5, 4.2))
    bp = ax.boxplot(
        vectors, vert=True, patch_artist=True, tick_labels=order,
        whis=1.5, widths=0.65, showmeans=False, manage_ticks=True,
    )
    for patch, col in zip(bp['boxes'], box_colors):
        patch.set(facecolor=col, edgecolor="black", linewidth=0.8)
    for whisker in bp['whiskers']: 
        whisker.set(color="black", linewidth=0.8)
    for cap in bp['caps']: 
        cap.set(color="black", linewidth=0.8)
    for median in bp['medians']: 
        median.set(color="black", linewidth=1.2)
    if 'fliers' in bp:
        for fl in bp['fliers']:
            fl.set(marker='o', markerfacecolor='none', markeredgecolor='black', markersize=3, alpha=0.7)

    rng = np.random.default_rng(2025)
    for i, vals in enumerate(vectors, start=1):
        if len(vals) == 0: 
            continue
        jitter = (rng.random(size=len(vals)) - 0.5) * 0.25
        ax.plot(np.full(len(vals), i) + jitter, vals, 'o',
                markerfacecolor='none', markeredgecolor='black', markersize=3, alpha=0.6, linewidth=0.8)

    y_top = max([np.max(v) if len(v) else 0 for v in vectors] + [1.0])
    ax.set_ylim(0, min(1.15, y_top + 0.05))
    ax.set_ylabel(NICE.get(metric, metric))
    ax.set_title(FAMILY_TAG + " [" + NICE.get(metric, metric) + "] (n = " + str(N_IN_TITLE) + ")")
    ax.set_xticklabels(order, rotation=45, ha='right')
    ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(0.1))
    ax.grid(axis='y', color=str(0.92), linestyle='-', linewidth=0.5)

    leg = ax.legend(handles=legend_handles, loc='upper right', frameon=True)
    leg.get_frame().set_edgecolor('black')
    leg.get_frame().set_linewidth(0.8)
    leg.get_frame().set_alpha(1.0)
    leg.get_frame().set_facecolor('white')

    save_fig(fig, FAMILY_TAG + " - Boxplot " + NICE.get(metric, metric))

Creating boxplots...


In [19]:
"""This cell creates a heatmap showing all performance metrics for all methods
in a grid format, with darker colors indicating better performance."""

# Create heatmap of mean metrics
print("Creating heatmap...")
methods_all = sorted(list({r["method"] for r in summary_rows}))
M = []
for m in methods_all:
    row_vals = []
    for metric in HEAT_METRICS:
        matches = [r for r in summary_rows if r["method"] == m and r["metric"] == metric]
        row_vals.append(matches[0]["mean"] if matches and not np.isnan(matches[0]["mean"]) else np.nan)
    M.append(row_vals)
M = np.array(M, float)

primary_idx = [HEAT_METRICS.index("f1_dice"), HEAT_METRICS.index("iou_jaccard"), HEAT_METRICS.index("mcc")]
order_idx = np.argsort(-np.nanmean(M[:, primary_idx], axis=1))
methods_sorted = [methods_all[i] for i in order_idx]
M_sorted = M[order_idx]

fig, ax = plt.subplots(figsize=(6.6, 0.45*len(methods_sorted) + 1.2))
im = ax.imshow(M_sorted, aspect="auto", cmap="Greys", vmin=0, vmax=1.0)
for i in range(M_sorted.shape[0]):
    for j in range(M_sorted.shape[1]):
        v = M_sorted[i, j]
        if np.isnan(v): 
            continue
        ax.text(j, i, str(round(v,2)), ha='center', va='center', color='black')
ax.set_xticks(np.arange(len(HEAT_METRICS)))
ax.set_xticklabels([NICE.get(m, m) for m in HEAT_METRICS], rotation=45, ha='right')
ax.set_yticks(np.arange(len(methods_sorted)))
ax.set_yticklabels(methods_sorted)
ax.set_title(FAMILY_TAG + " — Mean metrics (methods × metrics)")
cbar = fig.colorbar(im, ax=ax, fraction=0.03, pad=0.03)
cbar.set_label("Mean (0–1)")
save_fig(fig, FAMILY_TAG + " - Heatmap mean metrics")

Creating heatmap...


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
"""This cell performs comprehensive statistical testing including Kruskal-Wallis tests,
pairwise Wilcoxon tests, and comparisons against the best reference method,
saving all results to CSV files."""

# Generate statistical test CSVs
print("Writing statistical test results...")
def write_stats_csvs():
    """Write statistical test results to CSV files"""
    kw_path  = os.path.join(STATS_DIR, "KruskalWallis [" + FAMILY_TAG + "].csv")
    wx_path  = os.path.join(STATS_DIR, "Wilcoxon_Pairs [" + FAMILY_TAG + "].csv")
    vr_path  = os.path.join(STATS_DIR, "Wilcoxon_vsRef [" + FAMILY_TAG + "].csv")

    with open(kw_path, "w", newline="", encoding="utf-8") as fkw, \
         open(wx_path, "w", newline="", encoding="utf-8") as fwx, \
         open(vr_path, "w", newline="", encoding="utf-8") as fvr:

        kw_writer = csv.writer(fkw)
        kw_writer.writerow(["metric","k_groups","H","p_value"])
        wx_writer = csv.writer(fwx)
        wx_writer.writerow([
            "metric","method_A","method_B","n_common",
            "wilcoxon_n_eff","wilcoxon_W","wilcoxon_p_raw","wilcoxon_mode","wilcoxon_p_holm",
            "sign_n","sign_pos","sign_p_raw","sign_p_holm",
            "HL_estimate(A-B)","median_diff(A-B)"
        ])
        vr_writer = csv.writer(fvr)
        vr_writer.writerow([
            "metric","reference","method","n_common",
            "wilcoxon_n_eff","wilcoxon_W","wilcoxon_p_raw","wilcoxon_mode","wilcoxon_p_holm","stars",
            "sign_n","sign_pos","sign_p_raw","sign_p_holm",
            "HL_estimate(ref - method)","median_diff(ref - method)"
        ])

        for metric in METRICS_ALL:
            ser = build_series_by_replicate(metric)
            groups = [np.array(list(d.values()), float) for d in ser.values() if len(d) >= 2]
            if len(groups) >= 2:
                H, p = kruskal(*groups, nan_policy='omit')
                kw_writer.writerow([metric, len(groups), float(H), float(p)])

        for metric in METRICS_ALL:
            ser = build_series_by_replicate(metric)
            methods = sorted(ser.keys())
            pairs = list(itertools.combinations(methods, 2))

            wx_p_raw_pairs = []
            temp_rows = []
            for a, b in pairs:
                ra, rb = ser[a], ser[b]
                common = sorted(set(ra.keys()) & set(rb.keys()))
                if len(common) < 2:
                    continue
                a_vals = np.array([ra[r] for r in common], float)
                b_vals = np.array([rb[r] for r in common], float)

                W, p_wx, n_eff, mode = wilcoxon_exact_or_pratt(a_vals, b_vals)
                p_sign, n_sign, pos = sign_test_two_sided(a_vals, b_vals)
                HL = hodges_lehmann(a_vals, b_vals)
                med = float(np.median(a_vals - b_vals))
                if not np.isnan(p_wx): 
                    wx_p_raw_pairs.append(((a,b), p_wx))
                temp_rows.append((a,b,len(common), n_eff, W, p_wx, mode, n_sign, pos, p_sign, HL, med))

            holm_wx = holm_correction(wx_p_raw_pairs) if wx_p_raw_pairs else {}
            sign_p_raw_pairs = [((a,b), p_sign) for (a,b,_,_,_,_,_,_,_,p_sign,_,_) in temp_rows if not np.isnan(p_sign)]
            holm_sign = holm_correction(sign_p_raw_pairs) if sign_p_raw_pairs else {}

            for a,b,n_common, n_eff,W,p_wx,mode,n_sign,pos,p_sign,HL,med in temp_rows:
                wx_writer.writerow([
                    metric, a, b, n_common,
                    n_eff, W, p_wx, mode, holm_wx.get((a,b), holm_wx.get((b,a), np.nan)),
                    n_sign, pos, p_sign, holm_sign.get((a,b), holm_sign.get((b,a), np.nan)),
                    HL, med
                ])

            ref, p_holm_wx_m, p_holm_sign_m, wilcox_cache_m, sign_cache_m = compute_ref_and_holm(metric)
            if ref is not None:
                for m in methods:
                    if m == ref: 
                        continue
                    n_eff, W, p_wx_raw, mode, HL, med = lookup_pair(wilcox_cache_m, ref, m, (np.nan,)*6)
                    p_wx_holm = lookup_pair(p_holm_wx_m, ref, m, np.nan)
                    n_sign, pos, p_sign_raw = lookup_pair(sign_cache_m, ref, m, (np.nan,)*3)
                    p_sign_holm = lookup_pair(p_holm_sign_m, ref, m, np.nan)
                    p_for_star = p_wx_holm if not np.isnan(p_wx_holm) else p_sign_holm
                    stars = p_to_stars(p_for_star) if not np.isnan(p_for_star) and p_for_star < 0.05 else ''
                    vr_writer.writerow([
                        metric, ref, m, 
                        int(n_eff) if not np.isnan(n_eff) else np.nan,
                        n_eff, W, p_wx_raw, mode, p_wx_holm, stars,
                        n_sign, pos, p_sign_raw, p_sign_holm,
                        HL, med
                    ])

    print("Saved:", kw_path)
    print("Saved:", wx_path)
    print("Saved:", vr_path)

write_stats_csvs()

Writing statistical test results...
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Stats\KruskalWallis [CRYO-SEM X30000].csv
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Stats\Wilcoxon_Pairs [CRYO-SEM X30000].csv
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Stats\Wilcoxon_vsRef [CRYO-SEM X30000].csv


In [None]:
"""This cell creates Bland-Altman plots that show agreement between different methods
by plotting the difference versus the average, helping identify bias and limits of
agreement."""

# Create Bland-Altman plots
print("Creating Bland-Altman plots...")
def bland_altman_plot(a_vals, b_vals, title, outfile_base, dirpath=BA_DIR):
    """Create a Bland-Altman agreement plot"""
    a_vals = np.asarray(a_vals, float)
    b_vals = np.asarray(b_vals, float)
    means = (a_vals + b_vals) / 2.0
    diffs = a_vals - b_vals
    n = len(diffs)

    bias = float(np.mean(diffs))
    sd = float(np.std(diffs, ddof=1)) if n > 1 else 0.0

    # Use t-based quantile for 95% CI with fallback to 1.96
    q = float(analysis_t.ppf(0.975, n-1)) if n > 1 else 1.96

    loa_hi = bias + q * sd
    loa_lo = bias - q * sd

    # Calculate 95% CI for bias and limits of agreement
    se_bias = sd / np.sqrt(n) if n > 0 else np.nan
    se_loa  = sd * np.sqrt((1.0/n) + (q**2)/(2.0*(n-1))) if n > 1 else np.nan

    bias_ci_lo = bias - q*se_bias if np.isfinite(se_bias) else np.nan
    bias_ci_hi = bias + q*se_bias if np.isfinite(se_bias) else np.nan

    loa_hi_ci_lo = loa_hi - q*se_loa if np.isfinite(se_loa) else np.nan
    loa_hi_ci_hi = loa_hi + q*se_loa if np.isfinite(se_loa) else np.nan
    loa_lo_ci_lo = loa_lo - q*se_loa if np.isfinite(se_loa) else np.nan
    loa_lo_ci_hi = loa_lo + q*se_loa if np.isfinite(se_loa) else np.nan

    fig, ax = plt.subplots(figsize=(5.0, 3.6))

    # Add confidence bands first so they appear behind points
    if np.isfinite(bias_ci_lo) and np.isfinite(bias_ci_hi):
        ax.axhspan(bias_ci_lo, bias_ci_hi, alpha=0.12, color='blue', linewidth=0, zorder=1)
    if np.isfinite(loa_hi_ci_lo) and np.isfinite(loa_hi_ci_hi):
        ax.axhspan(loa_hi_ci_lo, loa_hi_ci_hi, alpha=0.15, color='red', linewidth=0, zorder=1)
    if np.isfinite(loa_lo_ci_lo) and np.isfinite(loa_lo_ci_hi):
        ax.axhspan(loa_lo_ci_lo, loa_lo_ci_hi, alpha=0.15, color='red', linewidth=0, zorder=1)

    # Plot the data points
    ax.plot(means, diffs, 'o', markerfacecolor='none', markeredgecolor='black',
            markersize=4, alpha=0.85, zorder=2)

    # Add the main lines
    ax.axhline(bias,  linestyle='-',  color='blue', linewidth=1.1, label='Bias', zorder=3)
    ax.axhline(loa_hi, linestyle='--', color='red',  linewidth=1.0, label='LoA',  zorder=3)
    ax.axhline(loa_lo, linestyle='--', color='red',  linewidth=1.0,               zorder=3)

    ax.set_xlabel("Mean of pair")
    ax.set_ylabel("Difference (A − B)")
    ax.set_title(title)
    ax.grid(axis='both', color=str(0.92), linestyle='-', linewidth=0.5)

    handles = [
        Line2D([0], [0], color='blue', lw=1.2, label='Bias (±95% CI)'),
        Line2D([0], [0], color='red',  lw=1.0, linestyle='--', label='LoA (±95% CI)'),
    ]
    ax.legend(handles=handles, loc='lower right', frameon=True)

    # Add text annotation with statistics
    ann = [
        "Bias=" + str(round(bias,3)) + (" (95% CI " + str(round(bias_ci_lo,3)) + " to " + str(round(bias_ci_hi,3)) + ")" if np.isfinite(bias_ci_lo) else ""),
        "LoA+=" + str(round(loa_hi,3)) + (" (95% CI " + str(round(loa_hi_ci_lo,3)) + " to " + str(round(loa_hi_ci_hi,3)) + ")" if np.isfinite(loa_hi_ci_lo) else ""),
        "LoA−=" + str(round(loa_lo,3)) + (" (95% CI " + str(round(loa_lo_ci_lo,3)) + " to " + str(round(loa_lo_ci_hi,3)) + ")" if np.isfinite(loa_lo_ci_lo) else ""),
    ]
    ax.text(0.02, 0.98, "\n".join(ann), transform=ax.transAxes,
            ha='left', va='top', fontsize=8,
            bbox=dict(boxstyle="round,pad=0.25", facecolor="white", edgecolor="black", linewidth=0.6))

    save_fig(fig, outfile_base, dirpath=dirpath)

if ENABLE_BLAND_ALTMAN:
    for metric in METRICS_ALL:
        # Build series data for Bland-Altman analysis
        series = {}
        for row in perrep_rows:
            v = row[metric]
            if np.isnan(v): 
                continue
            series.setdefault(row["method"], {})[row["replicate"]] = float(v)
        methods = sorted(series.keys())
        if len(methods) < 2:
            continue

        # Choose pairs for comparison
        pairs = []
        if BA_PAIRS_VS_REF_ONLY:
            tbl = table_for_metric(summary_rows, metric)
            if not tbl: 
                continue
            ref = max(tbl, key=lambda d: d["mean"])["method"]
            for m in methods:
                if m == ref: 
                    continue
                pairs.append((ref, m))
        else:
            pairs = list(itertools.combinations(methods, 2))

        for A, B in pairs:
            ra, rb = series.get(A, {}), series.get(B, {})
            common = sorted(set(ra.keys()) & set(rb.keys()))
            if len(common) < BA_MIN_COMMON:
                continue
            a_vals = [ra[r] for r in common]
            b_vals = [rb[r] for r in common]
            title = FAMILY_TAG + " — Bland–Altman [" + NICE.get(metric, metric) + "]\nA=" + A + " vs B=" + B + " (n=" + str(len(common)) + ")"
            base = FAMILY_TAG + " - Bland-Altman [" + NICE.get(metric, metric) + "] A=" + A + " vs B=" + B
            bland_altman_plot(a_vals, b_vals, title, base)

    print("Bland–Altman: EXPORTED to", BA_DIR)
else:
    print("Bland–Altman: SKIPPED (set ENABLE_BLAND_ALTMAN=True to export)")

Creating Bland-Altman plots...
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Figures\Bland-Altman\CRYO-SEM X30000 - Bland-Altman [Dice (F1)] A=ILASTIK vs B=60%.tif
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Figures\Bland-Altman\CRYO-SEM X30000 - Bland-Altman [Dice (F1)] A=ILASTIK vs B=60%.pdf
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Figures\Bland-Altman\CRYO-SEM X30000 - Bland-Altman [Dice (F1)] A=ILASTIK vs B=FREEHAND.tif
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Figures\Bland-Altman\CRYO-SEM X30000 - Bland-Altman [Dice (F1)] A=ILASTIK vs B=FREEHAND.pdf
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\MULTIPLE GS METHOD\Figures\Bland-Altman\CRYO-SEM X30000 - Bland-Altman [Dice (F1)] A=ILASTIK vs B=OTSU.tif
Saved: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROG