In [5]:
import os
import csv
from math import sqrt
import numpy as np
import cv2
import sys
import shutil

# This script assembles group-level reporting for CRYO-SEM X10000.
# It does not calculate Dice, IoU, etc. and it does not build overlays.
# You already did that separately.
#
# What it does:
# 1. Light QC: confirm masks exist and match shape for each replicate.
# 2. You paste the per-replicate metric values you already measured.
# 3. It writes:
#    - Group_PerReplicate_Values [CRYO-SEM X10000].csv
#    - Group_Summary [CRYO-SEM X10000].csv
# 4. It records Python / NumPy / OpenCV / SciPy versions.
# 5. It tries to save a copy of this script next to the CSVs for provenance.

# SciPy is only used for the t critical value when forming 95% CI
try:
    from scipy.stats import t as tdist
    _HAS_SCIPY = True
except Exception:
    _HAS_SCIPY = False

# Optional TIFF readers
try:
    import tifffile as tiff
    _HAS_TIFF = True
except Exception:
    _HAS_TIFF = False

try:
    from PIL import Image
    _HAS_PIL = True
except Exception:
    _HAS_PIL = False


# Replicate folders holding the masks. The four folders differ only by the
# bracketed replicate number, so they are generated from one path template.
source_replicate_dirs = [
    rf"C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X10000\CRYO-SEM X10000 [{idx}]"
    for idx in range(1, 5)
]

# Repository folder that receives the group CSVs and the script snapshot.
base_out_dir = r"C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML"
group_dir = base_out_dir

# File names of the method masks expected inside every replicate folder.
method_files = [
    "60%.tif", "FREEHAND.tif", "OVAL.tif", "ILASTIK.tif", "OTSU.tif",
    "PLANKSTER.tif", "PORED2.tif", "SAMJ.tif", "SEMI.tif", "UNET.tif",
]

# Metric column names, in the order they appear in both output tables.
metrics_to_report = [
    "f1_dice", "iou_jaccard", "precision", "recall",
    "specificity", "balanced_accuracy", "accuracy", "mcc",
]


def _read_tiff_any(path):
    """
    Load a TIFF mask from disk using whichever reader manages to decode it.

    Readers are tried in a fixed order: OpenCV first, then tifffile (when it
    was importable), then PIL (when it was importable). A reader that raises
    is skipped and the next one is tried.

    Args:
        path: location of the image file.

    Returns:
        The decoded image as a numpy array, or None when no reader succeeded.
    """
    decoded = cv2.imread(path, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_GRAYSCALE)

    # tifffile is the first fallback when OpenCV could not decode the file
    if decoded is None and _HAS_TIFF:
        try:
            return tiff.imread(path)
        except Exception:
            pass

    # PIL is the last resort
    if decoded is None and _HAS_PIL:
        try:
            with Image.open(path) as handle:
                # keep 16-bit data as 16-bit, everything else becomes 8-bit L
                target_mode = "I;16" if "I;16" in handle.mode else "L"
                return np.array(handle.convert(target_mode))
        except Exception:
            pass

    # either the OpenCV result, or None when every reader failed
    return decoded


def _to_gray(arr):
    """
    Force the image to a single 2D grayscale array for size checking.
    This mirrors the pipeline logic, but no thresholding.
    """
    if arr.ndim == 2:
        return arr

    if arr.ndim == 3 and arr.shape[-1] in (3, 4):
        a = arr
        # If it's not already uint8, scale to 0..255 uint8
        if a.dtype != np.uint8:
            a_min = float(a.min())
            a_max = float(a.max())
            if a_max > a_min:
                a = ((a - a_min) / (a_max - a_min + 1e-12) * 255.0).astype(np.uint8)
            else:
                a = np.zeros_like(a, dtype=np.uint8)
        if a.shape[-1] == 4:
            a = cv2.cvtColor(a, cv2.COLOR_BGRA2BGR)
        return cv2.cvtColor(a, cv2.COLOR_BGR2GRAY)

    # If it's a stack or something with extra dims, take first slice/channel
    if arr.ndim > 2:
        return _to_gray(arr[..., 0])

    return arr


def load_mask_shape(path):
    """
    Look up the (height, width) of a mask stored on disk.

    Args:
        path: location of the mask file.

    Returns:
        The shape tuple of the mask after reduction to a single 2D plane.

    Raises:
        FileNotFoundError: the path does not exist, or no reader could
            decode the file.
        ValueError: the image could not be reduced to a 2D array.
    """
    if not os.path.exists(path):
        raise FileNotFoundError("missing: " + path)

    decoded = _read_tiff_any(path)
    if decoded is None:
        raise FileNotFoundError("cannot read image: " + path)

    plane = _to_gray(decoded)
    if plane.ndim == 2:
        return plane.shape

    raise ValueError("not single-channel: " + path)


def ci95(mean_val, sd_val, n_val):
    """
    Compute the 95 percent confidence interval of a mean across replicates.

    The half-width is critical_value * sd / sqrt(n). The critical value is
    the two-sided Student t quantile with n - 1 degrees of freedom when
    SciPy is available, and the normal approximation 1.96 otherwise.

    Args:
        mean_val: sample mean.
        sd_val: sample standard deviation.
        n_val: number of replicates.

    Returns:
        (low, high). Both are nan when n_val is None or below 2, or when
        sd_val is None or nan.
    """
    if n_val is None or n_val < 2 or sd_val is None or np.isnan(sd_val):
        return (np.nan, np.nan)

    critical = float(tdist.ppf(0.975, df=n_val - 1)) if _HAS_SCIPY else 1.96
    margin = critical * sd_val / sqrt(n_val)
    return (mean_val - margin, mean_val + margin)


# Light QC pass over every replicate folder:
#   - a replicate without GOLD STANDARD.tif is reported and skipped
#   - a missing method mask is reported and skipped
#   - a method mask whose shape differs from the gold standard stops the run,
#     since that usually means mixed resolutions or a cropped mask

for src_root in source_replicate_dirs:
    replicate_name = os.path.basename(src_root)
    gold_path = os.path.join(src_root, "GOLD STANDARD.tif")

    if os.path.exists(gold_path):
        gold_shape = load_mask_shape(gold_path)
    else:
        print(f"warning: replicate {replicate_name} is missing GOLD STANDARD.tif")
        continue

    for mf in method_files:
        pred_path = os.path.join(src_root, mf)

        if not os.path.exists(pred_path):
            print(f"warning: replicate {replicate_name} is missing {mf}")
            continue

        this_shape = load_mask_shape(pred_path)
        if this_shape == gold_shape:
            continue

        raise ValueError(
            f"shape mismatch in {replicate_name} for {mf}: {this_shape} vs {gold_shape}"
        )


# perrep_rows holds the per-replicate metric values measured elsewhere.
# This script does not compute them; paste them in by hand.
#
# Each row is a list whose column order must match the CSV header
# ["method", "replicate"] + metrics_to_report:
#   [method_name,
#    replicate_name,
#    f1_dice,
#    iou_jaccard,
#    precision,
#    recall,
#    specificity,
#    balanced_accuracy,
#    accuracy,
#    mcc]
#
# The summary step groups rows by column 0 (method_name) and reads the metric
# values from column 2 onward, so the method label must be spelled identically
# across replicates.
#
# Example layout (replace with real numbers):
# ["UNET", "CRYO-SEM X10000 [1]", 0.91, 0.85, 0.90, 0.92, 0.95, 0.94, 0.93, 0.89]
#
# If this list is left empty, both output CSVs contain only their header row.

perrep_rows = [
    # paste your rows here, one per method per replicate
]


# Validate the pasted rows before anything is written. A row of the wrong
# length would otherwise either crash the summary with an IndexError after the
# per-replicate CSV is already on disk, or silently misalign the columns.
expected_row_len = 2 + len(metrics_to_report)
for row_idx, row in enumerate(perrep_rows):
    if len(row) != expected_row_len:
        raise ValueError(
            f"perrep_rows[{row_idx}] has {len(row)} values, expected {expected_row_len} "
            "(method, replicate, then one value per entry in metrics_to_report)"
        )

if not perrep_rows:
    # Not fatal, but a header-only report is almost certainly a forgotten paste.
    print("Warning: perrep_rows is empty; both CSVs will contain only a header row.")

# make sure output folder exists
os.makedirs(group_dir, exist_ok=True)

# write the raw per-replicate metrics table
perrep_csv = os.path.join(group_dir, "Group_PerReplicate_Values [CRYO-SEM X10000].csv")
with open(perrep_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["method", "replicate"] + metrics_to_report)
    w.writerows(perrep_rows)

print("Wrote per-replicate values: " + perrep_csv)

# now summarize across replicates
# for each method and each metric, compute mean, sd, n, and 95% CI

summary_csv = os.path.join(group_dir, "Group_Summary [CRYO-SEM X10000].csv")
rows_out = []

# group rows by method in a single pass; row order within a method is kept
rows_by_method = {}
for row in perrep_rows:
    rows_by_method.setdefault(row[0], []).append(row)

methods_in_rows = sorted(rows_by_method)

for method_name in methods_in_rows:
    vals_this_method = rows_by_method[method_name]
    n_img = len(vals_this_method)

    # metric columns start at index 2 in each row
    for j, metric_name in enumerate(metrics_to_report, start=2):
        arr = np.array([row[j] for row in vals_this_method], dtype=float)

        mean_k = float(arr.mean())

        if n_img > 1:
            # sample standard deviation (ddof=1) across replicates
            sd_k = float(arr.std(ddof=1))
            lo_k, hi_k = ci95(mean_k, sd_k, n_img)
        else:
            # spread and CI are undefined for a single replicate
            sd_k = lo_k = hi_k = float("nan")

        rows_out.append([method_name, metric_name, mean_k, sd_k, n_img, lo_k, hi_k])

# write summary table
with open(summary_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["method", "metric", "mean", "sd", "n_images", "ci95_low", "ci95_high"])
    w.writerows(rows_out)

print("Wrote group summary: " + summary_csv)

# Best-effort provenance: copy this script next to the CSVs.
# Nothing here may abort the run. A missing __file__ (for example when the
# code runs in a notebook) or a failed copy is only reported.
try:
    if "__file__" not in globals():
        print("Skipping script snapshot (no __file__ available).")
    else:
        this_script_path = os.path.abspath(__file__)
        script_copy_path = os.path.join(base_out_dir, "cryo_sem_group_summary_snapshot.py")
        if not os.path.exists(this_script_path):
            print("Warning: could not locate script file to snapshot.")
        else:
            shutil.copyfile(this_script_path, script_copy_path)
            print(f"Saved script snapshot to: {script_copy_path}")
except Exception as exc:
    print(f"Snapshot copy failed: {exc}")

# record versions for reproducibility
print("Python: " + sys.version.split()[0])
print("NumPy: " + np.__version__)
print("OpenCV: " + cv2.__version__)
if _HAS_SCIPY:
    # The file header promises the SciPy version, so print it alongside the
    # note on which critical value the confidence intervals used.
    import scipy  # importable whenever the guarded scipy.stats import succeeded

    print("SciPy: " + scipy.__version__)
    print("SciPy for CI: yes (t-based)")
else:
    print("SciPy for CI: no (used normal 1.96 fallback)")

Wrote per-replicate values: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\Group_PerReplicate_Values [CRYO-SEM X10000].csv
Wrote group summary: C:\Users\walsh\Documents\GitHub\AGAROSE-HYDROGEL-TRENDS-USING-AI-ML\Group_Summary [CRYO-SEM X10000].csv
Skipping script snapshot (no __file__ available).
Python: 3.10.18
NumPy: 1.26.4
OpenCV: 4.10.0
SciPy for CI: yes (t-based)
