In [6]:
# ============================================================
# Q1-LEVEL TILE-BASED HISTOLOGY FEATURE DISCOVERY PIPELINE
# Fully data-driven | No arbitrary thresholds | Resume-safe
# ============================================================

import os
import json
import numpy as np
import pandas as pd
import openslide
import torch
import timm
import warnings
import matplotlib.pyplot as plt

from skimage.filters import threshold_otsu, laplace
from skimage.morphology import remove_small_objects, binary_dilation, disk
from skimage.color import rgb2gray
from skimage.measure import label, regionprops
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# ============================================================
# CONFIG (ONLY NON-LEARNABLE ITEMS)
# ============================================================

SVS_DIR = r"C:\Users\Shahinur\Downloads\PKG_Dataset\PKG - Brain-Mets-Lung-MRI-Path-Segs_histopathology images\data"
OUT_DIR = r"D:\combined_q1"
os.makedirs(OUT_DIR, exist_ok=True)

FEATURE_CSV = os.path.join(OUT_DIR, "histology_features_all.csv")
QC_REPORT_CSV = os.path.join(OUT_DIR, "qc_report.csv")
CHECKPOINT_FILE = os.path.join(OUT_DIR, "checkpointtt.json")
CALIBRATION_FILE = os.path.join(OUT_DIR, "calibrationnn.json")

TILE_SIZE = 224
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# ============================================================
# CHECKPOINTING
# ============================================================

def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            return json.load(f)
    return {"processed": [], "failed": []}

def save_checkpoint(processed, failed):
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump({
            "processed": list(processed),
            "failed": list(failed),
        }, f, indent=2)

# ============================================================
# TILE UTILITIES
# ============================================================

def read_tile(slide, x, y, level):
    return np.array(
        slide.read_region((x, y), level, (TILE_SIZE, TILE_SIZE)).convert("RGB")
    )

def tissue_mask(tile):
    gray = rgb2gray(tile)
    t = threshold_otsu(gray)
    mask = gray < t
    mask = remove_small_objects(mask, 500)
    return binary_dilation(mask, disk(3))

def blur_score(tile):
    return laplace(rgb2gray(tile)).var()

def background_ratio(tile):
    return np.mean(tile) / 255.0

# ============================================================
# TILE SAMPLING (SYSTEMATIC, TILE-BASED)
# ============================================================

def sample_tiles(slide_path, max_tiles=300):
    slide = openslide.OpenSlide(slide_path)
    level = slide.get_best_level_for_downsample(1)
    w, h = slide.level_dimensions[level]

    tiles = []
    step = TILE_SIZE

    for y in range(0, h - TILE_SIZE, step):
        for x in range(0, w - TILE_SIZE, step):
            if len(tiles) >= max_tiles:
                break
            tile = read_tile(slide, x, y, level)
            if background_ratio(tile) < 0.95:
                tiles.append(tile)
        if len(tiles) >= max_tiles:
            break

    slide.close()
    return tiles

# ============================================================
# DATA-DRIVEN THRESHOLD OPTIMIZATION
# ============================================================

def optimize_tissue_threshold(tiles):
    thresholds = np.linspace(0.1, 0.9, 17)
    scores = []

    for t in thresholds:
        coverage = []
        for tile in tiles:
            m = tissue_mask(tile)
            if m.mean() >= t:
                coverage.append(m.mean())
        scores.append(np.var(coverage) if len(coverage) > 20 else 0)

    best_t = thresholds[np.argmax(scores)]

    plt.plot(thresholds, scores, marker="o")
    plt.xlabel("Tissue Fraction Threshold")
    plt.ylabel("Stability (Variance)")
    plt.title("Optimal Tissue Threshold")
    plt.savefig(os.path.join(OUT_DIR, "tissue_threshold_optimization.png"))
    plt.close()

    return float(best_t)

def estimate_blur_threshold(tiles):
    scores = np.array([blur_score(t) for t in tiles])
    threshold = np.percentile(scores, 10)

    plt.hist(scores, bins=50)
    plt.axvline(threshold, color="red")
    plt.title("Blur Score Distribution")
    plt.savefig(os.path.join(OUT_DIR, "blur_threshold.png"))
    plt.close()

    return float(threshold)

# ============================================================
# MORPHOLOGY FEATURES
# ============================================================

def morphology_features(tile, mask):
    labeled = label(mask)
    regions = regionprops(labeled)
    if len(regions) == 0:
        return None

    areas = [r.area for r in regions]
    ecc = [r.eccentricity for r in regions]

    return {
        "nuclei_count": len(regions),
        "area_mean": np.mean(areas),
        "area_std": np.std(areas),
        "ecc_mean": np.mean(ecc),
        "ecc_std": np.std(ecc),
    }

# ============================================================
# TEXTURE FEATURES
# ============================================================

def texture_features(tile):
    gray = rgb2gray(tile)
    hist, _ = np.histogram(gray, bins=32, density=True)
    entropy = -np.sum(hist * np.log(hist + 1e-8))
    return {
        "gray_mean": np.mean(gray),
        "gray_std": np.std(gray),
        "gray_entropy": entropy,
    }

# ============================================================
# DINO FEATURE EXTRACTOR
# ============================================================

class DinoExtractor:
    def __init__(self):
        self.model = timm.create_model(
            "vit_small_patch16_224.dino",
            pretrained=True,
            num_classes=0
        ).to(DEVICE).eval()

    def extract(self, tiles):
        feats = []
        for t in tiles:
            img = torch.tensor(t / 255.0).permute(2,0,1).unsqueeze(0).float().to(DEVICE)
            with torch.no_grad():
                feats.append(self.model(img).cpu().numpy().squeeze())
        return np.array(feats)

def pca_reduce(features, tag):
    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    pca = PCA()
    Xp = pca.fit_transform(X)
    cum = np.cumsum(pca.explained_variance_ratio_)

    k = np.argmax(cum >= 0.95) + 1

    plt.plot(cum)
    plt.axhline(0.95, color="red")
    plt.title(f"PCA Variance ‚Äì {tag}")
    plt.savefig(os.path.join(OUT_DIR, f"pca_{tag}.png"))
    plt.close()

    return Xp[:, :k], k

# ============================================================
# MAIN PIPELINE (RESUME SAFE)
# ============================================================

def main():
    slides = sorted([f for f in os.listdir(SVS_DIR) if f.lower().endswith(".svs")])
    checkpoint = load_checkpoint()

    processed = set(checkpoint["processed"])
    failed = set(checkpoint["failed"])

    results = []
    qc_rows = []

    if os.path.exists(FEATURE_CSV):
        results = pd.read_csv(FEATURE_CSV).to_dict("records")
    if os.path.exists(QC_REPORT_CSV):
        qc_rows = pd.read_csv(QC_REPORT_CSV).to_dict("records")

    print(f"\nTotal slides: {len(slides)}")
    print(f"Processed: {len(processed)} | Failed: {len(failed)} | Remaining: {len(slides)-len(processed)-len(failed)}")

    # ---- GLOBAL CALIBRATION (ONCE) ----
    if not os.path.exists(CALIBRATION_FILE):
        print("\nüîß Running global calibration...")
        sample_tiles_all = []
        for s in slides[:5]:
            sample_tiles_all.extend(sample_tiles(os.path.join(SVS_DIR, s)))

        tissue_t = optimize_tissue_threshold(sample_tiles_all)
        blur_t = estimate_blur_threshold(sample_tiles_all)

        with open(CALIBRATION_FILE, "w") as f:
            json.dump({"tissue_threshold": tissue_t, "blur_threshold": blur_t}, f, indent=2)
    else:
        with open(CALIBRATION_FILE) as f:
            calib = json.load(f)
            tissue_t = calib["tissue_threshold"]
            blur_t = calib["blur_threshold"]

    print(f"‚úî Tissue threshold: {tissue_t:.3f}")
    print(f"‚úî Blur threshold  : {blur_t:.3f}")

    dino = DinoExtractor()

    for idx, slide_name in enumerate(slides, 1):
        if slide_name in processed or slide_name in failed:
            print(f"[SKIP] {slide_name}")
            continue

        print(f"\n[{idx}/{len(slides)}] Processing {slide_name}")
        path = os.path.join(SVS_DIR, slide_name)

        try:
            tiles = sample_tiles(path)
            morph_list, tex_list = [], []

            for tile in tiles:
                mask = tissue_mask(tile)
                if mask.mean() < tissue_t:
                    continue
                if blur_score(tile) < blur_t:
                    continue

                mf = morphology_features(tile, mask)
                tf = texture_features(tile)
                if mf:
                    morph_list.append(list(mf.values()))
                    tex_list.append(list(tf.values()))

            if len(morph_list) < 5:
                raise RuntimeError("Insufficient valid tiles")

            dino_feats = dino.extract(tiles)
            dino_red, dino_k = pca_reduce(dino_feats, slide_name)

            row = {
                "slide_id": slide_name,
                "morph_features": len(morph_list[0]),
                "texture_features": len(tex_list[0]),
                "dino_features": dino_k,
                "total_features": len(morph_list[0]) + len(tex_list[0]) + dino_k
            }

            results.append(row)
            processed.add(slide_name)
            qc_rows.append({"slide_id": slide_name, "status": "success"})

        except Exception as e:
            print(f"‚ùå Failed: {e}")
            failed.add(slide_name)
            qc_rows.append({"slide_id": slide_name, "status": "failed", "reason": str(e)})

        # SAVE EVERY SLIDE
        pd.DataFrame(results).to_csv(FEATURE_CSV, index=False)
        pd.DataFrame(qc_rows).to_csv(QC_REPORT_CSV, index=False)
        save_checkpoint(processed, failed)

    print("\n‚úÖ PIPELINE COMPLETED")
    print(f"Features saved: {FEATURE_CSV}")
    print(f"QC report saved: {QC_REPORT_CSV}")

if __name__ == "__main__":
    main()



Total slides: 111
Processed: 0 | Failed: 0 | Remaining: 111

üîß Running global calibration...
‚úî Tissue threshold: 0.100
‚úî Blur threshold  : 0.000021

[1/111] Processing YG_0CBM148C1MFN_wsi.svs
üë§ Slide        : YG_0CBM148C1MFN_wsi.svs
üß© Tiles used   : 149
üß¨ Nuclei count: 261
üß† DINO dims   : 45

[2/111] Processing YG_0PGQQ6USQ9JB_wsi.svs
üë§ Slide        : YG_0PGQQ6USQ9JB_wsi.svs
üß© Tiles used   : 141
üß¨ Nuclei count: 281
üß† DINO dims   : 43

[3/111] Processing YG_2I5MDHB0AXEA_wsi.svs


KeyboardInterrupt: 

In [10]:
# ============================================================
# Q1 RAW HISTOLOGY FEATURE EXTRACTION (NO PCA / NO REDUCTION)
# Whole-slide nuclear segmentation | Fixed dims | Restart-safe
# ============================================================

import os, json, warnings, traceback
import numpy as np
import pandas as pd
import openslide
import torch
import timm

from skimage.color import rgb2gray
from skimage.filters import threshold_otsu
from skimage.morphology import remove_small_objects, binary_dilation, disk
from skimage.measure import label, regionprops
from skimage.feature import graycomatrix, graycoprops

warnings.filterwarnings("ignore")

# =========================
# CONFIG
# =========================
SVS_DIR = r"C:\Users\Shahinur\Downloads\PKG_Dataset\PKG - Brain-Mets-Lung-MRI-Path-Segs_histopathology images\data"
OUT_DIR = r"D:\Q1_RAW_FEATURES"
os.makedirs(OUT_DIR, exist_ok=True)

TILE_SIZE = 224
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

FEATURE_CSV = f"{OUT_DIR}/histology_raw_features.csv"
CHECKPOINT = f"{OUT_DIR}/checkpoint.json"

# =========================
# CHECKPOINT
# =========================
def load_checkpoint():
    if os.path.exists(CHECKPOINT):
        return json.load(open(CHECKPOINT))
    return {"done": []}

def save_checkpoint(done):
    json.dump({"done": done}, open(CHECKPOINT, "w"), indent=2)

# =========================
# WHOLE-SLIDE TISSUE MASK
# =========================
def get_tissue_mask(slide):
    lvl = slide.get_best_level_for_downsample(16)
    img = np.array(slide.read_region((0,0), lvl, slide.level_dimensions[lvl]))[:,:,:3]
    gray = rgb2gray(img)
    t = threshold_otsu(gray)
    mask = gray < t
    mask = remove_small_objects(mask, 5000)
    mask = binary_dilation(mask, disk(5))
    return mask, lvl

# =========================
# WHOLE-SLIDE NUCLEAR SEGMENTATION
# =========================
def count_nuclei_whole_slide(slide, mask, lvl):
    down = slide.level_downsamples[lvl]
    H, W = mask.shape
    nuclei = 0

    for y in range(0, H, TILE_SIZE):
        for x in range(0, W, TILE_SIZE):
            if mask[y:y+TILE_SIZE, x:x+TILE_SIZE].mean() < 0.1:
                continue
            x0, y0 = int(x*down), int(y*down)
            tile = np.array(
                slide.read_region((x0,y0), 0, (TILE_SIZE,TILE_SIZE))
            )[:,:,:3]
            gray = rgb2gray(tile)
            try:
                t = threshold_otsu(gray)
            except:
                continue
            nuc = gray < t
            nuc = remove_small_objects(nuc, 30)
            nuclei += len(regionprops(label(nuc)))

    return nuclei, mask.sum()

# =========================
# TILE EXTRACTION
# =========================
def extract_tiles(slide, mask, lvl, max_tiles=300):
    tiles = []
    down = slide.level_downsamples[lvl]
    H, W = mask.shape

    for y in range(0, H, TILE_SIZE):
        for x in range(0, W, TILE_SIZE):
            if len(tiles) >= max_tiles:
                break
            if mask[y:y+TILE_SIZE, x:x+TILE_SIZE].mean() < 0.5:
                continue
            x0, y0 = int(x*down), int(y*down)
            tile = np.array(
                slide.read_region((x0,y0), 0, (TILE_SIZE,TILE_SIZE))
            )[:,:,:3]
            tiles.append(tile)
    return tiles

# =========================
# MORPHOLOGY FEATURES
# =========================
def morphology_features(tile):
    gray = rgb2gray(tile)
    t = threshold_otsu(gray)
    mask = gray < t
    mask = remove_small_objects(mask, 50)
    props = regionprops(label(mask))
    if len(props) == 0:
        return None
    areas = [p.area for p in props]
    ecc = [p.eccentricity for p in props]
    return {
        "nuclei_tile": len(props),
        "area_mean": np.mean(areas),
        "area_std": np.std(areas),
        "ecc_mean": np.mean(ecc),
        "ecc_std": np.std(ecc),
    }

# =========================
# TEXTURE FEATURES
# =========================
def texture_features(tile):
    gray = (rgb2gray(tile) * 255).astype(np.uint8)

    glcm = graycomatrix(
        gray,
        distances=[1, 2, 4],
        angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
        symmetric=True,
        normed=True
    )

    feats = {
        "glcm_contrast": graycoprops(glcm, "contrast").mean(),
        "glcm_homogeneity": graycoprops(glcm, "homogeneity").mean(),
        "glcm_energy": graycoprops(glcm, "energy").mean(),
        "glcm_correlation": graycoprops(glcm, "correlation").mean(),
    }
    return feats

# =========================
# DEEP MODELS
# =========================
class Dino:
    def __init__(self):
        self.model = timm.create_model(
            "vit_small_patch16_224.dino",
            pretrained=True,
            num_classes=0
        ).to(DEVICE).eval()

    def __call__(self, tiles):
        feats = []
        for t in tiles:
            x = torch.tensor(t/255.).permute(2,0,1).unsqueeze(0).float().to(DEVICE)
            with torch.no_grad():
                feats.append(self.model(x).cpu().numpy().squeeze())
        return np.array(feats)

class ResNet50:
    def __init__(self):
        self.model = timm.create_model(
            "resnet50",
            pretrained=True,
            num_classes=0
        ).to(DEVICE).eval()

    def __call__(self, tiles):
        feats = []
        for t in tiles:
            x = torch.tensor(t/255.).permute(2,0,1).unsqueeze(0).float().to(DEVICE)
            with torch.no_grad():
                feats.append(self.model(x).cpu().numpy().squeeze())
        return np.array(feats)

# =========================
# AGGREGATION
# =========================
def aggregate(feats, prefix):
    row = {}
    for stat, fn in {
        "mean": np.mean,
        "std": np.std,
        "min": np.min,
        "max": np.max,
        "median": np.median
    }.items():
        for i,v in enumerate(fn(feats,axis=0)):
            row[f"{prefix}_{stat}_{i}"] = float(v)
    return row

# =========================
# MAIN
# =========================
def main():
    slides = [f for f in os.listdir(SVS_DIR) if f.endswith(".svs")]
    ckpt = load_checkpoint()
    done = set(ckpt["done"])

    dino = Dino()
    resnet = ResNet50()
    rows = []

    for i,f in enumerate(slides):
        if f in done:
            continue

        print(f"\n[{i+1}/{len(slides)}] {f}")
        slide = openslide.OpenSlide(os.path.join(SVS_DIR,f))

        mask, lvl = get_tissue_mask(slide)
        nuclei, tissue_area = count_nuclei_whole_slide(slide, mask, lvl)
        tiles = extract_tiles(slide, mask, lvl)

        print(f"üß© Tiles   : {len(tiles)}")
        print(f"üß¨ Nuclei : {nuclei}")

        # Morph + texture
        morph, text = [], []
        for t in tiles:
            mf = morphology_features(t)
            if mf: morph.append(list(mf.values()))
            text.append(list(texture_features(t).values()))

        # Deep
        dino_feats = dino(tiles)
        res_feats  = resnet(tiles)

        row = {
            "slide": f,
            "tiles": len(tiles),
            "nuclei": nuclei,
            "tissue_pixels": tissue_area,
            "nuclei_density": nuclei / tissue_area
        }

        if len(morph):
            row.update(aggregate(np.array(morph), "morph"))
        row.update(aggregate(np.array(text), "texture"))
        row.update(aggregate(dino_feats, "dino"))
        row.update(aggregate(res_feats, "resnet"))

        rows.append(row)
        done.add(f)
        save_checkpoint(list(done))
        pd.DataFrame(rows).to_csv(FEATURE_CSV, index=False)

        slide.close()

    print("\n‚úÖ RAW FEATURE EXTRACTION COMPLETE")

if __name__ == "__main__":
    main()



[4/111] YG_2VWCV5YWB078_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 26599

[5/111] YG_30TUKBI1ZXBK_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 32721

[6/111] YG_31S9L6RD6RCA_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 15647

[7/111] YG_34W2PP4X6FL6_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 5346

[8/111] YG_37RLQEBG98MP_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 24461

[9/111] YG_38SD26C9HLLT_wsi.svs
üß© Tiles   : 34
üß¨ Nuclei : 2088

[10/111] YG_3LUYSEZA89OT_wsi.svs
üß© Tiles   : 148
üß¨ Nuclei : 6215

[11/111] YG_3OAF908JG3XG_wsi.svs
üß© Tiles   : 254
üß¨ Nuclei : 5636

[12/111] YG_3ULZIC6OE5NB_wsi.svs
üß© Tiles   : 120
üß¨ Nuclei : 6887

[13/111] YG_3YJ63A56N6VQ_wsi.svs
üß© Tiles   : 161
üß¨ Nuclei : 3768

[14/111] YG_4M3SWS9DT0W0_wsi.svs
üß© Tiles   : 207
üß¨ Nuclei : 5337

[15/111] YG_4RD15Z2MNGTF_wsi.svs
üß© Tiles   : 70
üß¨ Nuclei : 3325

[16/111] YG_5LPM5R5PDW2S_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 11721

[17/111] YG_5WXJER534E8W_wsi.svs
üß© Tiles   : 300
üß¨ Nuclei : 

MemoryError: Unable to allocate 3.27 GiB for an array with shape (29522, 14873) and data type float64

In [8]:
# ============================================================
# COMPLETE Q1-READY HISTOLOGY PIPELINE - PRODUCTION VERSION
# LUNIT ATOM + INTERPRETABLE FEATURES + ROBUST OPTIMIZATION
# ALL SLIDES PROCESSED (NO DATA LOSS)
# ============================================================

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import numpy as np
import pandas as pd
import openslide
import torch
import torchvision.transforms as transforms
from PIL import Image
from skimage.filters import threshold_otsu, laplace, gaussian
from skimage.morphology import remove_small_objects, binary_dilation, disk
from skimage.color import rgb2hsv, rgb2gray
from skimage.measure import regionprops, label
from skimage.feature import graycomatrix, graycoprops
from scipy import stats
from sklearn.metrics import roc_curve, auc
import json
from datetime import datetime
import warnings
import matplotlib.pyplot as plt
from pathlib import Path
import timm
import traceback

warnings.filterwarnings("ignore")

# ===============================
# REPRODUCIBILITY
# ===============================
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True

# ===============================
# CONFIGURATION
# ===============================
SVS_DIR = r"C:\Users\Shahinur\Downloads\PKG_Dataset\PKG - Brain-Mets-Lung-MRI-Path-Segs_histopathology images\data"
OUTPUT_DIR = "histology_q1_production_final"
Path(OUTPUT_DIR).mkdir(exist_ok=True)
Path(f"{OUTPUT_DIR}/figures").mkdir(exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("="*80)
print("Q1-READY: LUNIT ATOM + INTERPRETABLE FEATURES (PRODUCTION)")
print("="*80)
print(f"Device: {DEVICE}")
print(f"Seed: {RANDOM_SEED}")
print(f"Output: {OUTPUT_DIR}\n")

def log_msg(m):
    """Thread-safe logging"""
    print(m)
    try:
        with open(f"{OUTPUT_DIR}/progress.log", 'a') as f:
            f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {m}\n")
    except:
        pass

# ===============================
# OPTIMIZER - FIXED & ROBUST
# ===============================
class Optimizer:
    """Production-grade optimizer with robust error handling"""
    
    def __init__(self, slides, n=300):
        self.slides = slides
        self.n = n
        self.results = {}
    
    def _bg(self, t):
        """Check if tile is background"""
        return np.mean(t) > 220
    
    def _blur(self, t):
        """Compute blur score with gradient boost for low contrast"""
        g = rgb2gray(t)
        v = laplace(g).var()
        return v + (np.sqrt(np.gradient(g)[0]**2 + np.gradient(g)[1]**2).mean()*10 if v<10 else 0)
    
    def _mask(self, t):
        """Tissue segmentation mask"""
        g = np.mean(t, 2)
        th = threshold_otsu(g) if g.std()>1 else 200
        m = g < th
        m = remove_small_objects(m, 500)
        m = binary_dilation(m, disk(3))
        return m
    
    def elbow(self, sz, mx=250):
        """Elbow method for optimal tile count"""
        log_msg("METHOD 1: Elbow (Tile Count)")
        cnts, vars = [], []
        
        for p in self.slides[:3]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                ts = []
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(ts)>=mx: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t) and self._mask(t).sum()/t.size>=0.1:
                            ts.append(rgb2gray(t).flatten())
                    if len(ts)>=mx: break
                
                sl.close()
                
                if len(ts)<50: continue
                ta = np.array(ts)
                
                for n in range(25, mx+1, 25):
                    if n>len(ta): continue
                    vars.append(np.var(np.mean(ta[:n], 0)))
                    cnts.append(n)
                    
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(cnts)<3:
            log_msg("  ‚ö†Ô∏è Insufficient data, using default: 100")
            return 100
        
        cnts, vars = np.array(cnts), np.array(vars)
        d2 = np.gradient(np.gradient(vars))
        opt = max(50, min(int(cnts[np.argmin(np.abs(d2))]), 200))
        
        self.results['elbow'] = {'optimal': opt, 'samples': len(cnts)}
        log_msg(f"‚úÖ Optimal tiles: {opt}")
        return opt
    
    def youden(self, sz):
        """Youden's J statistic for blur threshold"""
        log_msg("METHOD 2: Youden's J (Blur)")
        blurs, tisss = [], []
        
        for p in self.slides[:4]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(blurs)>=500: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            blurs.append(self._blur(t))
                            tisss.append(self._mask(t).sum()/t.size)
                    if len(blurs)>=500: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(blurs) < 100:
            log_msg("  ‚ö†Ô∏è Insufficient data, using default: 0.1")
            return 0.1
        
        ba, ta = np.array(blurs), np.array(tisss)
        emp, tis = ta<0.05, ta>=0.3
        
        if emp.sum() < 10 or tis.sum() < 10:
            # Fallback to percentile
            opt = float(np.percentile(ba, 5))
            log_msg(f"‚úÖ Blur threshold (percentile): {opt:.4f}")
            return opt
        
        ths = np.percentile(ba, np.arange(1,20,1))
        js = []
        
        for th in ths:
            tp = (ba[emp]<th).sum()
            fn = (ba[tis]>=th).sum()
            fp = (ba[tis]<th).sum()
            tn = (ba[emp]>=th).sum()
            
            sensitivity = tp/(tp+fn+1e-8)
            specificity = tn/(tn+fp+1e-8)
            js.append(sensitivity + specificity - 1)
        
        opt = float(ths[np.argmax(js)])
        self.results['youden'] = {'optimal': opt, 'j': float(max(js))}
        log_msg(f"‚úÖ Blur threshold: {opt:.4f}")
        return opt
    
    def tissue_threshold_robust(self, sz):
        """
        ROBUST tissue threshold using multiple methods
        Q1-ready: No arbitrary defaults, data-driven fallbacks
        """
        log_msg("METHOD 3: Tissue Threshold (Multi-Method)")
        
        tisss = []
        
        # Collect tissue percentages
        for p in self.slides[:5]:  # Use more slides
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(tisss)>=600:  # More samples
                            break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            tp = self._mask(t).sum()/t.size
                            tisss.append(tp)
                    if len(tisss)>=600:
                        break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(tisss) < 100:
            log_msg("  ‚ùå CRITICAL: Insufficient data for tissue threshold")
            return None
        
        ta = np.array(tisss)
        
        log_msg(f"  üìä Tissue % distribution:")
        log_msg(f"     Samples: {len(ta)}")
        log_msg(f"     Mean: {ta.mean():.3f}, Std: {ta.std():.3f}")
        log_msg(f"     Min: {ta.min():.3f}, Max: {ta.max():.3f}")
        log_msg(f"     P10: {np.percentile(ta, 10):.3f}, P50: {np.percentile(ta, 50):.3f}")
        
        # METHOD A: Percentile-based (conservative)
        # Use 25th percentile - excludes mostly-background tiles
        method_a = float(np.percentile(ta, 25))
        log_msg(f"  Method A (P25): {method_a:.3f}")
        
        # METHOD B: Otsu on tissue distribution
        try:
            # Bin the tissue percentages
            hist, bin_edges = np.histogram(ta, bins=50)
            # Find threshold that separates background-heavy from tissue-rich
            cumsum = np.cumsum(hist)
            total = cumsum[-1]
            
            max_var = 0
            best_th = 0.3
            
            for i in range(1, len(hist)-1):
                w0 = cumsum[i] / total
                w1 = 1 - w0
                
                if w0 == 0 or w1 == 0:
                    continue
                
                m0 = np.average(bin_edges[:i+1], weights=hist[:i+1]) if hist[:i+1].sum() > 0 else 0
                m1 = np.average(bin_edges[i+1:], weights=hist[i+1:]) if hist[i+1:].sum() > 0 else 0
                
                var = w0 * w1 * (m0 - m1)**2
                
                if var > max_var:
                    max_var = var
                    best_th = bin_edges[i]
            
            method_b = float(best_th)
            log_msg(f"  Method B (Otsu): {method_b:.3f}")
            
        except Exception as e:
            log_msg(f"  Method B failed: {e}")
            method_b = method_a
        
        # METHOD C: Gap statistic
        # Find largest gap in sorted tissue percentages
        try:
            sorted_ta = np.sort(ta)
            gaps = np.diff(sorted_ta)
            
            # Find gap in range [0.2, 0.6]
            valid_gaps = []
            for i, gap in enumerate(gaps):
                if 0.2 <= sorted_ta[i] <= 0.6:
                    valid_gaps.append((gap, sorted_ta[i]))
            
            if valid_gaps:
                max_gap = max(valid_gaps, key=lambda x: x[0])
                method_c = float(max_gap[1])
                log_msg(f"  Method C (Gap): {method_c:.3f}")
            else:
                method_c = method_a
                log_msg(f"  Method C (Gap): No gap found, using P25")
                
        except Exception as e:
            log_msg(f"  Method C failed: {e}")
            method_c = method_a
        
        # METHOD D: Mixture model (simple 2-component)
        try:
            # Assume bimodal: background-heavy vs tissue-rich
            # Find local minimum between modes
            hist, bins = np.histogram(ta, bins=30)
            smoothed = np.convolve(hist, np.ones(3)/3, mode='same')
            
            # Find local minima
            minima = []
            for i in range(1, len(smoothed)-1):
                if smoothed[i] < smoothed[i-1] and smoothed[i] < smoothed[i+1]:
                    if 0.2 <= bins[i] <= 0.6:
                        minima.append((smoothed[i], bins[i]))
            
            if minima:
                # Use deepest minimum
                method_d = float(min(minima, key=lambda x: x[0])[1])
                log_msg(f"  Method D (Mixture): {method_d:.3f}")
            else:
                method_d = method_a
                log_msg(f"  Method D (Mixture): No minimum, using P25")
                
        except Exception as e:
            log_msg(f"  Method D failed: {e}")
            method_d = method_a
        
        # CONSENSUS: Use median of methods (robust to outliers)
        methods = [method_a, method_b, method_c, method_d]
        consensus = float(np.median(methods))
        
        # Clamp to reasonable range
        consensus = max(0.25, min(consensus, 0.65))
        
        log_msg(f"\n  üìä Multi-Method Results:")
        log_msg(f"     A (P25): {method_a:.3f}")
        log_msg(f"     B (Otsu): {method_b:.3f}")
        log_msg(f"     C (Gap): {method_c:.3f}")
        log_msg(f"     D (Mixture): {method_d:.3f}")
        log_msg(f"  üéØ Consensus (median): {consensus:.3f}")
        
        self.results['tissue_threshold'] = {
            'optimal': consensus,
            'method_a_p25': method_a,
            'method_b_otsu': method_b,
            'method_c_gap': method_c,
            'method_d_mixture': method_d,
            'samples': len(ta),
            'distribution': {
                'mean': float(ta.mean()),
                'std': float(ta.std()),
                'p10': float(np.percentile(ta, 10)),
                'p25': float(np.percentile(ta, 25)),
                'p50': float(np.percentile(ta, 50)),
                'p75': float(np.percentile(ta, 75))
            }
        }
        
        log_msg(f"‚úÖ Tissue threshold: {consensus:.2f} (robust multi-method)")
        return consensus
    
    def roc(self, sz):
        """DEPRECATED: Use tissue_threshold_robust() instead"""
        log_msg("‚ö†Ô∏è Using robust multi-method tissue threshold instead of ROC")
        return self.tissue_threshold_robust(sz)
    
    def bootstrap(self, sz, n=50):
        """Bootstrap confidence interval for blur threshold"""
        log_msg("METHOD 4: Bootstrap")
        blurs = []
        
        for p in self.slides[:2]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(blurs)>=200: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            blurs.append(self._blur(t))
                    if len(blurs)>=200: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(blurs) < 50:
            log_msg("  ‚ö†Ô∏è Insufficient data for bootstrap")
            return 0.1, 0.0
        
        ba = np.array(blurs)
        bs = [np.percentile(np.random.choice(ba, len(ba), True), 5) for _ in range(n)]
        mu, std = np.mean(bs), np.std(bs)
        
        self.results['bootstrap'] = {'mean': float(mu), 'std': float(std)}
        log_msg(f"‚úÖ Bootstrap: {mu:.4f}¬±{std:.4f}")
        return mu, std
    
    def entropy(self, sz):
        """Compute stain normalization targets"""
        log_msg("METHOD 5: Entropy (Stain)")
        tiles = []
        
        for p in self.slides[:3]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(tiles)>=200: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t) and self._mask(t).sum()/t.size>=0.3:
                            tiles.append(t.astype(np.float32)/255)
                    if len(tiles)>=200: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(tiles) < 20:
            log_msg("  ‚ö†Ô∏è Insufficient data, using defaults")
            m, s = np.array([0.75, 0.55, 0.45]), np.array([0.15, 0.15, 0.15])
        else:
            ms = [t.mean((0,1)) for t in tiles]
            ss = [t.std((0,1)) for t in tiles]
            m, s = np.mean(ms,0), np.mean(ss,0)
        
        self.results['entropy'] = {'means': m.tolist(), 'stds': s.tolist()}
        log_msg(f"‚úÖ Stain: means={m.round(3)}")
        return m, s
    
    def save(self):
        """Save optimization results"""
        try:
            with open(f"{OUTPUT_DIR}/optimization.json", 'w') as f:
                json.dump({
                    'timestamp': datetime.now().isoformat(),
                    'seed': RANDOM_SEED,
                    **self.results
                }, f, indent=2)
            log_msg(f"‚úÖ Optimization results saved\n")
        except Exception as e:
            log_msg(f"‚ö†Ô∏è Could not save optimization: {e}")

# ===============================
# INTERPRETABLE FEATURES
# ===============================
class InterpExtractor:
    """Extract interpretable histology features"""
    
    def nuclear(self, t):
        """Nuclear morphology features"""
        g = rgb2gray(t)
        try:
            b = g < threshold_otsu(g)*0.8
        except:
            b = g < 100
        
        l = label(b)
        r = regionprops(l)
        
        if not r:
            return {f'nuc_{k}':0 for k in ['cnt','area_m','area_s','dens','circ','sol']}
        
        a = np.array([x.area for x in r])
        c = np.array([4*np.pi*x.area/(x.perimeter**2+1e-8) for x in r])
        s = np.array([x.solidity for x in r])
        
        return {
            'nuc_cnt': len(r),
            'nuc_area_m': a.mean(),
            'nuc_area_s': a.std(),
            'nuc_dens': len(r)/b.size,
            'nuc_circ': c.mean(),
            'nuc_sol': s.mean()
        }
    
    def arch(self, t):
        """Architectural features (organization uniformity)"""
        g = rgb2gray(t)
        sm = gaussian(g, 5)
        
        # Compute local variance
        vs = [np.var(g[i:i+20,j:j+20]) 
              for i in range(0,g.shape[0]-20,20) 
              for j in range(0,g.shape[1]-20,20)]
        
        return {
            'arch_org': np.mean(vs) if vs else 0,
            'arch_uni': np.std(vs) if vs else 0
        }
    
    def texture(self, t):
        """Texture features via GLCM"""
        g = (rgb2gray(t)*255).astype(np.uint8)
        
        try:
            glcm = graycomatrix(g, [1], [0], 256, symmetric=True, normed=True)
            f = {}
            for p in ['contrast','homogeneity','energy']:
                f[f'tex_{p}'] = float(graycoprops(glcm, p)[0,0])
        except:
            f = {f'tex_{p}':0 for p in ['contrast','homogeneity','energy']}
        
        return f
    
    def extract(self, t):
        """Extract all interpretable features from tile"""
        try:
            return {**self.nuclear(t), **self.arch(t), **self.texture(t)}
        except Exception as e:
            # Return zeros on error
            return {f'nuc_{k}':0 for k in ['cnt','area_m','area_s','dens','circ','sol']} | \
                   {'arch_org':0, 'arch_uni':0} | \
                   {f'tex_{p}':0 for p in ['contrast','homogeneity','energy']}

# ===============================
# ATOM EXTRACTOR (RESNET50)
# ===============================
class ATOMExtractor:
    """LUNIT ATOM-style feature extractor using ResNet50"""
    
    def __init__(self):
        log_msg("Loading ATOM (ResNet-50)...")
        try:
            self.model = timm.create_model(
                'resnet50',
                pretrained=True,
                num_classes=0,
                global_pool='avg'
            ).to(DEVICE).eval()
            log_msg("‚úÖ ATOM loaded (2048D)\n")
        except Exception as e:
            log_msg(f"‚ùå ATOM loading failed: {e}")
            raise
        
        self.tf = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225])
        ])
    
    def extract(self, tiles, sz=224):
        """Extract ATOM features from tiles"""
        if not tiles:
            return None
        
        fs = []
        log_msg(f"  Extracting ATOM features from {len(tiles)} tiles...")
        
        for i, t in enumerate(tiles):
            try:
                # Resize if needed
                if t.shape[0]!=sz or t.shape[1]!=sz:
                    t = np.array(Image.fromarray(t).resize((sz,sz)))
                
                x = self.tf(Image.fromarray(t)).unsqueeze(0).to(DEVICE)
                
                with torch.no_grad():
                    fs.append(self.model(x).squeeze().cpu().numpy())
                
                if (i+1)%50==0:
                    print(f"    {i+1}/{len(tiles)}", end='\r')
                    
            except Exception as e:
                continue
        
        if not fs:
            log_msg(f"  ‚ùå No features extracted")
            return None
        
        fs = np.array(fs)
        log_msg(f"  ‚úì Extracted {len(fs)} tile features")
        
        # FIXED: Robust outlier removal
        if len(fs) > 10:  # Only remove outliers if we have enough tiles
            try:
                # Compute z-scores
                mean_feat = fs.mean(0)
                std_feat = fs.std(0)
                
                # Avoid division by zero for constant features
                std_feat = np.where(std_feat < 1e-6, 1.0, std_feat)
                
                z = np.abs((fs - mean_feat) / std_feat)
                
                # More lenient threshold (5 instead of 3)
                # AND require multiple features to be outliers (not just 1)
                outlier_mask = (z > 5).sum(axis=1) > (z.shape[1] * 0.1)  # >10% features are outliers
                
                num_outliers = outlier_mask.sum()
                
                if num_outliers > 0 and num_outliers < len(fs) * 0.5:  # Don't remove >50%
                    fs = fs[~outlier_mask]
                    log_msg(f"  üîç Removed {num_outliers} outlier tiles")
                elif num_outliers >= len(fs) * 0.5:
                    log_msg(f"  ‚ö†Ô∏è Too many outliers ({num_outliers}), keeping all tiles")
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Outlier detection failed: {e}, keeping all tiles")
        else:
            log_msg(f"  ‚ö†Ô∏è Too few tiles for outlier removal, keeping all")
        
        if len(fs) == 0:
            log_msg(f"  ‚ùå All tiles removed as outliers")
            return None
        
        log_msg(f"  ‚úÖ Final: {len(fs)} tiles")
        
        # Aggregate features (with safety checks)
        try:
            return {
                'atom_m': fs.mean(0),
                'atom_s': fs.std(0),
                'atom_mx': fs.max(0) if len(fs) > 0 else np.zeros(fs.shape[1]),
                'atom_mn': fs.min(0) if len(fs) > 0 else np.zeros(fs.shape[1]),
                'atom_md': np.median(fs, 0) if len(fs) > 0 else np.zeros(fs.shape[1])
            }
        except Exception as e:
            log_msg(f"  ‚ùå Aggregation error: {e}")
            return None

# ===============================
# MAIN PIPELINE
# ===============================
def main():
    files = [f for f in os.listdir(SVS_DIR) if f.lower().endswith('.svs')]
    
    if len(files) < 10:
        log_msg("‚ùå Need ‚â•10 slides for calibration")
        return
    
    log_msg(f"Found {len(files)} SVS files")
    
    # CALIBRATION: Use random 10 slides
    np.random.shuffle(files)
    cal_paths = [os.path.join(SVS_DIR, f) for f in files[:10]]
    
    # PROCESSING: Use ALL slides (NO DATA LOSS)
    proc_files = files
    
    log_msg(f"\n{'='*80}")
    log_msg("STEP 1: OPTIMIZATION (CALIBRATION)")
    log_msg(f"{'='*80}")
    log_msg(f"Calibration slides: {len(cal_paths)}")
    log_msg(f"Processing slides: {len(proc_files)} (ALL - no data loss)\n")
    
    # Run optimization
    opt = Optimizer(cal_paths, 300)
    sz = 224
    n_tiles = opt.elbow(sz)
    blur_th = opt.youden(sz)
    tiss_th = opt.roc(sz)
    boot_m, boot_s = opt.bootstrap(sz)
    stain_m, stain_s = opt.entropy(sz)
    opt.save()
    
    # Save parameters
    params = {
        'tile_sz': sz,
        'n_tiles': n_tiles,
        'blur_th': blur_th,
        'tiss_th': tiss_th,
        'stain_m': stain_m.tolist(),
        'stain_s': stain_s.tolist(),
        'seed': RANDOM_SEED,
        'calibration_slides': 10,
        'processing_slides': len(proc_files)
    }
    
    with open(f"{OUTPUT_DIR}/params.json", 'w') as f:
        json.dump(params, f, indent=2)
    
    log_msg(f"\n{'='*80}")
    log_msg("STEP 2: FEATURE EXTRACTION")
    log_msg(f"{'='*80}\n")
    
    # Initialize extractors
    interp = InterpExtractor()
    
    try:
        atom = ATOMExtractor()
    except:
        log_msg("‚ö†Ô∏è ATOM loading failed, continuing with interpretable features only")
        atom = None
    
    # Storage
    interp_res, atom_res, qc = [], [], []
    
    # Process all slides
    for i, fn in enumerate(proc_files, 1):
        log_msg(f"\n[{i}/{len(proc_files)}] {fn}")
        
        try:
            sl = openslide.OpenSlide(os.path.join(SVS_DIR, fn))
            lv = sl.get_best_level_for_downsample(1)
            ds = sl.level_downsamples[lv]
            w, h = sl.level_dimensions[lv]
            
            tiles = []
            
            # Extract tiles
            for y in range(0, h-sz, sz):
                for x in range(0, w-sz, sz):
                    if len(tiles)>=n_tiles:
                        break
                    
                    t = np.array(sl.read_region(
                        (int(x*ds), int(y*ds)),
                        lv,
                        (sz,sz)
                    ).convert("RGB"))
                    
                    # QC checks
                    if np.mean(t)>220:  # Background
                        continue
                    
                    g = rgb2gray(t)
                    m = g < threshold_otsu(g) if g.std()>1 else g<200
                    
                    if m.sum()/m.size < tiss_th:  # Tissue percentage
                        continue
                    
                    if opt._blur(t) < blur_th:  # Blur
                        continue
                    
                    tiles.append(t)
                
                if len(tiles)>=n_tiles:
                    break
            
            sl.close()
            
            # Check minimum tiles
            if len(tiles) < n_tiles//2:
                log_msg(f"  ‚ùå Insufficient tiles: {len(tiles)}")
                qc.append({'slide': fn, 'status': 'fail', 'reason': 'insufficient_tiles', 'tiles': len(tiles)})
                continue
            
            # Extract interpretable features
            ifs = [interp.extract(t) for t in tiles]
            idf = pd.DataFrame(ifs)
            
            iagg = {'slide': fn}
            for c in idf.columns:
                iagg[f'{c}_m'] = idf[c].mean()
                iagg[f'{c}_s'] = idf[c].std()
            
            interp_res.append(iagg)
            
            # Extract ATOM features
            if atom:
                try:
                    af = atom.extract(tiles, sz)
                    if af:
                        aagg = {'slide': fn}
                        for k, v in af.items():
                            for j, x in enumerate(v):
                                aagg[f'{k}_{j}'] = float(x)
                        atom_res.append(aagg)
                    else:
                        log_msg(f"  ‚ö†Ô∏è ATOM extraction returned None")
                except Exception as e:
                    log_msg(f"  ‚ö†Ô∏è ATOM extraction failed: {e}")
                    traceback.print_exc()
            
            log_msg(f"  ‚úÖ Success: {len(tiles)} tiles")
            qc.append({'slide': fn, 'status': 'ok', 'tiles': len(tiles)})
            
            # Periodic save
            if i%10==0:
                pd.DataFrame(interp_res).to_csv(f"{OUTPUT_DIR}/interpretable.csv", index=False)
                if atom_res:
                    pd.DataFrame(atom_res).to_csv(f"{OUTPUT_DIR}/atom.csv", index=False)
                pd.DataFrame(qc).to_csv(f"{OUTPUT_DIR}/qc.csv", index=False)
                log_msg(f"  üíæ Checkpoint saved ({i} slides processed)")
        
        except Exception as e:
            log_msg(f"  ‚ùå Error: {e}")
            traceback.print_exc()
            qc.append({'slide': fn, 'status': 'fail', 'reason': str(e), 'tiles': 0})
    
    # Final save
    log_msg(f"\n{'='*80}")
    log_msg("FINAL SAVE")
    log_msg(f"{'='*80}")
    
    if interp_res:
        pd.DataFrame(interp_res).to_csv(f"{OUTPUT_DIR}/interpretable.csv", index=False)
        log_msg(f"‚úÖ Interpretable features: {len(interp_res)} slides")
    
    if atom_res:
        pd.DataFrame(atom_res).to_csv(f"{OUTPUT_DIR}/atom.csv", index=False)
        log_msg(f"‚úÖ ATOM features: {len(atom_res)} slides")
    
    pd.DataFrame(qc).to_csv(f"{OUTPUT_DIR}/qc.csv", index=False)
    log_msg(f"‚úÖ QC report saved")
    
    # Summary
    qc_df = pd.DataFrame(qc)
    success = (qc_df['status']=='ok').sum()
    failed = (qc_df['status']=='fail').sum()
    
    log_msg(f"\n{'='*80}")
    log_msg("PIPELINE COMPLETED")
    log_msg(f"{'='*80}")
    log_msg(f"‚úÖ Successful: {success}/{len(qc_df)} ({success/len(qc_df)*100:.1f}%)")
    log_msg(f"‚ùå Failed: {failed}/{len(qc_df)}")
    log_msg(f"\nOutput files:")
    log_msg(f"  - {OUTPUT_DIR}/interpretable.csv")
    log_msg(f"  - {OUTPUT_DIR}/atom.csv")
    log_msg(f"  - {OUTPUT_DIR}/qc.csv")
    log_msg(f"  - {OUTPUT_DIR}/params.json")
    log_msg(f"  - {OUTPUT_DIR}/optimization.json")

if __name__ == "__main__":
    main()

Q1-READY: LUNIT ATOM + INTERPRETABLE FEATURES (PRODUCTION)
Device: cpu
Seed: 42
Output: histology_q1_production_final

Found 111 SVS files

STEP 1: OPTIMIZATION (CALIBRATION)
Calibration slides: 10
Processing slides: 111 (ALL - no data loss)

METHOD 1: Elbow (Tile Count)
‚úÖ Optimal tiles: 150
METHOD 2: Youden's J (Blur)
‚úÖ Blur threshold: 0.2368
‚ö†Ô∏è Using robust multi-method tissue threshold instead of ROC
METHOD 3: Tissue Threshold (Multi-Method)
  üìä Tissue % distribution:
     Samples: 600
     Mean: 0.144, Std: 0.124
     Min: 0.000, Max: 0.330
     P10: 0.000, P50: 0.137
  Method A (P25): 0.010
  Method B failed: Axis must be specified when shapes of a and weights differ.
  Method C (Gap): 0.203
  Method D (Mixture): 0.231

  üìä Multi-Method Results:
     A (P25): 0.010
     B (Otsu): 0.010
     C (Gap): 0.203
     D (Mixture): 0.231
  üéØ Consensus (median): 0.250
‚úÖ Tissue threshold: 0.25 (robust multi-method)
METHOD 4: Bootstrap
‚úÖ Bootstrap: 0.1454¬±0.0112
METHOD 5

TypeError: remove: path should be string, bytes or os.PathLike, not NoneType

Exception ignored in: 'scipy._lib.messagestream.MessageStream.__dealloc__'
Traceback (most recent call last):
  File "messagestream.pyx", line 91, in scipy._lib.messagestream.MessageStream.close
TypeError: remove: path should be string, bytes or os.PathLike, not NoneType


KeyboardInterrupt: 

In [None]:
# ============================================================
# COMPLETE Q1-READY HISTOLOGY PIPELINE - PRODUCTION VERSION
# LUNIT ATOM + INTERPRETABLE FEATURES + ROBUST OPTIMIZATION
# ALL SLIDES PROCESSED (NO DATA LOSS)
# ============================================================

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import numpy as np
import pandas as pd
import openslide
import torch
import torchvision.transforms as transforms
from PIL import Image
from skimage.filters import threshold_otsu, laplace, gaussian
from skimage.morphology import remove_small_objects, binary_dilation, disk
from skimage.color import rgb2hsv, rgb2gray
from skimage.measure import regionprops, label
from skimage.feature import graycomatrix, graycoprops
from scipy import stats
from sklearn.metrics import roc_curve, auc
import json
from datetime import datetime
import warnings
import matplotlib.pyplot as plt
from pathlib import Path
import timm
import traceback

warnings.filterwarnings("ignore")

# ===============================
# REPRODUCIBILITY
# ===============================
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True

# ===============================
# CONFIGURATION
# ===============================
SVS_DIR = r"C:\Users\Shahinur\Downloads\PKG_Dataset\PKG - Brain-Mets-Lung-MRI-Path-Segs_histopathology images\data"
OUTPUT_DIR = "histology_q1_production_final"
Path(OUTPUT_DIR).mkdir(exist_ok=True)
Path(f"{OUTPUT_DIR}/figures").mkdir(exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("="*80)
print("Q1-READY: LUNIT ATOM + INTERPRETABLE FEATURES (PRODUCTION)")
print("="*80)
print(f"Device: {DEVICE}")
print(f"Seed: {RANDOM_SEED}")
print(f"Output: {OUTPUT_DIR}\n")

def log_msg(m):
    """Thread-safe logging"""
    print(m)
    try:
        with open(f"{OUTPUT_DIR}/progress.log", 'a') as f:
            f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {m}\n")
    except:
        pass

# ===============================
# OPTIMIZER - FIXED & ROBUST
# ===============================
class Optimizer:
    """Production-grade optimizer with robust error handling"""
    
    def __init__(self, slides, n=300):
        self.slides = slides
        self.n = n
        self.results = {}
    
    def _bg(self, t):
        """Check if tile is background"""
        return np.mean(t) > 220
    
    def _blur(self, t):
        """Compute blur score with gradient boost for low contrast"""
        g = rgb2gray(t)
        v = laplace(g).var()
        return v + (np.sqrt(np.gradient(g)[0]**2 + np.gradient(g)[1]**2).mean()*10 if v<10 else 0)
    
    def _mask(self, t):
        """Tissue segmentation mask"""
        g = np.mean(t, 2)
        th = threshold_otsu(g) if g.std()>1 else 200
        m = g < th
        m = remove_small_objects(m, 500)
        m = binary_dilation(m, disk(3))
        return m
    
    def elbow(self, sz, mx=250):
        """Elbow method for optimal tile count"""
        log_msg("METHOD 1: Elbow (Tile Count)")
        cnts, vars = [], []
        
        for p in self.slides[:3]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                ts = []
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(ts)>=mx: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t) and self._mask(t).sum()/t.size>=0.1:
                            ts.append(rgb2gray(t).flatten())
                    if len(ts)>=mx: break
                
                sl.close()
                
                if len(ts)<50: continue
                ta = np.array(ts)
                
                for n in range(25, mx+1, 25):
                    if n>len(ta): continue
                    vars.append(np.var(np.mean(ta[:n], 0)))
                    cnts.append(n)
                    
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(cnts)<3:
            log_msg("  ‚ö†Ô∏è Insufficient data, using default: 100")
            return 100
        
        cnts, vars = np.array(cnts), np.array(vars)
        d2 = np.gradient(np.gradient(vars))
        opt = max(50, min(int(cnts[np.argmin(np.abs(d2))]), 200))
        
        self.results['elbow'] = {'optimal': opt, 'samples': len(cnts)}
        log_msg(f"‚úÖ Optimal tiles: {opt}")
        return opt
    
    def youden(self, sz):
        """Youden's J statistic for blur threshold"""
        log_msg("METHOD 2: Youden's J (Blur)")
        blurs, tisss = [], []
        
        for p in self.slides[:4]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(blurs)>=500: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            blurs.append(self._blur(t))
                            tisss.append(self._mask(t).sum()/t.size)
                    if len(blurs)>=500: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(blurs) < 100:
            log_msg("  ‚ö†Ô∏è Insufficient data, using default: 0.1")
            return 0.1
        
        ba, ta = np.array(blurs), np.array(tisss)
        emp, tis = ta<0.05, ta>=0.3
        
        if emp.sum() < 10 or tis.sum() < 10:
            # Fallback to percentile
            opt = float(np.percentile(ba, 5))
            log_msg(f"‚úÖ Blur threshold (percentile): {opt:.4f}")
            return opt
        
        ths = np.percentile(ba, np.arange(1,20,1))
        js = []
        
        for th in ths:
            tp = (ba[emp]<th).sum()
            fn = (ba[tis]>=th).sum()
            fp = (ba[tis]<th).sum()
            tn = (ba[emp]>=th).sum()
            
            sensitivity = tp/(tp+fn+1e-8)
            specificity = tn/(tn+fp+1e-8)
            js.append(sensitivity + specificity - 1)
        
        opt = float(ths[np.argmax(js)])
        self.results['youden'] = {'optimal': opt, 'j': float(max(js))}
        log_msg(f"‚úÖ Blur threshold: {opt:.4f}")
        return opt
    
    def tissue_threshold_robust(self, sz):
        """
        ROBUST tissue threshold using multiple methods
        Q1-ready: No arbitrary defaults, data-driven fallbacks
        """
        log_msg("METHOD 3: Tissue Threshold (Multi-Method)")
        
        tisss = []
        
        # Collect tissue percentages
        for p in self.slides[:5]:  # Use more slides
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(tisss)>=600:  # More samples
                            break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            tp = self._mask(t).sum()/t.size
                            tisss.append(tp)
                    if len(tisss)>=600:
                        break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(tisss) < 100:
            log_msg("  ‚ùå CRITICAL: Insufficient data for tissue threshold")
            return None
        
        ta = np.array(tisss)
        
        log_msg(f"  üìä Tissue % distribution:")
        log_msg(f"     Samples: {len(ta)}")
        log_msg(f"     Mean: {ta.mean():.3f}, Std: {ta.std():.3f}")
        log_msg(f"     Min: {ta.min():.3f}, Max: {ta.max():.3f}")
        log_msg(f"     P10: {np.percentile(ta, 10):.3f}, P50: {np.percentile(ta, 50):.3f}")
        
        # METHOD A: Percentile-based (conservative)
        # Use 25th percentile - excludes mostly-background tiles
        method_a = float(np.percentile(ta, 25))
        log_msg(f"  Method A (P25): {method_a:.3f}")
        
        # METHOD B: Otsu on tissue distribution
        try:
            # Bin the tissue percentages
            hist, bin_edges = np.histogram(ta, bins=50)
            # Find threshold that separates background-heavy from tissue-rich
            cumsum = np.cumsum(hist)
            total = cumsum[-1]
            
            max_var = 0
            best_th = 0.3
            
            for i in range(1, len(hist)-1):
                w0 = cumsum[i] / total
                w1 = 1 - w0
                
                if w0 == 0 or w1 == 0:
                    continue
                
                m0 = np.average(bin_edges[:i+1], weights=hist[:i+1]) if hist[:i+1].sum() > 0 else 0
                m1 = np.average(bin_edges[i+1:], weights=hist[i+1:]) if hist[i+1:].sum() > 0 else 0
                
                var = w0 * w1 * (m0 - m1)**2
                
                if var > max_var:
                    max_var = var
                    best_th = bin_edges[i]
            
            method_b = float(best_th)
            log_msg(f"  Method B (Otsu): {method_b:.3f}")
            
        except Exception as e:
            log_msg(f"  Method B failed: {e}")
            method_b = method_a
        
        # METHOD C: Gap statistic
        # Find largest gap in sorted tissue percentages
        try:
            sorted_ta = np.sort(ta)
            gaps = np.diff(sorted_ta)
            
            # Find gap in range [0.2, 0.6]
            valid_gaps = []
            for i, gap in enumerate(gaps):
                if 0.2 <= sorted_ta[i] <= 0.6:
                    valid_gaps.append((gap, sorted_ta[i]))
            
            if valid_gaps:
                max_gap = max(valid_gaps, key=lambda x: x[0])
                method_c = float(max_gap[1])
                log_msg(f"  Method C (Gap): {method_c:.3f}")
            else:
                method_c = method_a
                log_msg(f"  Method C (Gap): No gap found, using P25")
                
        except Exception as e:
            log_msg(f"  Method C failed: {e}")
            method_c = method_a
        
        # METHOD D: Mixture model (simple 2-component)
        try:
            # Assume bimodal: background-heavy vs tissue-rich
            # Find local minimum between modes
            hist, bins = np.histogram(ta, bins=30)
            smoothed = np.convolve(hist, np.ones(3)/3, mode='same')
            
            # Find local minima
            minima = []
            for i in range(1, len(smoothed)-1):
                if smoothed[i] < smoothed[i-1] and smoothed[i] < smoothed[i+1]:
                    if 0.2 <= bins[i] <= 0.6:
                        minima.append((smoothed[i], bins[i]))
            
            if minima:
                # Use deepest minimum
                method_d = float(min(minima, key=lambda x: x[0])[1])
                log_msg(f"  Method D (Mixture): {method_d:.3f}")
            else:
                method_d = method_a
                log_msg(f"  Method D (Mixture): No minimum, using P25")
                
        except Exception as e:
            log_msg(f"  Method D failed: {e}")
            method_d = method_a
        
        # CONSENSUS: Use median of methods (robust to outliers)
        methods = [method_a, method_b, method_c, method_d]
        consensus = float(np.median(methods))
        
        # Clamp to reasonable range
        consensus = max(0.25, min(consensus, 0.65))
        
        log_msg(f"\n  üìä Multi-Method Results:")
        log_msg(f"     A (P25): {method_a:.3f}")
        log_msg(f"     B (Otsu): {method_b:.3f}")
        log_msg(f"     C (Gap): {method_c:.3f}")
        log_msg(f"     D (Mixture): {method_d:.3f}")
        log_msg(f"  üéØ Consensus (median): {consensus:.3f}")
        
        self.results['tissue_threshold'] = {
            'optimal': consensus,
            'method_a_p25': method_a,
            'method_b_otsu': method_b,
            'method_c_gap': method_c,
            'method_d_mixture': method_d,
            'samples': len(ta),
            'distribution': {
                'mean': float(ta.mean()),
                'std': float(ta.std()),
                'p10': float(np.percentile(ta, 10)),
                'p25': float(np.percentile(ta, 25)),
                'p50': float(np.percentile(ta, 50)),
                'p75': float(np.percentile(ta, 75))
            }
        }
        
        log_msg(f"‚úÖ Tissue threshold: {consensus:.2f} (robust multi-method)")
        return consensus
    
    def roc(self, sz):
        """DEPRECATED: Use tissue_threshold_robust() instead"""
        log_msg("‚ö†Ô∏è Using robust multi-method tissue threshold instead of ROC")
        return self.tissue_threshold_robust(sz)
    
    def bootstrap(self, sz, n=50):
        """Bootstrap confidence interval for blur threshold"""
        log_msg("METHOD 4: Bootstrap")
        blurs = []
        
        for p in self.slides[:2]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(blurs)>=200: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t):
                            blurs.append(self._blur(t))
                    if len(blurs)>=200: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(blurs) < 50:
            log_msg("  ‚ö†Ô∏è Insufficient data for bootstrap")
            return 0.1, 0.0
        
        ba = np.array(blurs)
        bs = [np.percentile(np.random.choice(ba, len(ba), True), 5) for _ in range(n)]
        mu, std = np.mean(bs), np.std(bs)
        
        self.results['bootstrap'] = {'mean': float(mu), 'std': float(std)}
        log_msg(f"‚úÖ Bootstrap: {mu:.4f}¬±{std:.4f}")
        return mu, std
    
    def entropy(self, sz):
        """Compute stain normalization targets"""
        log_msg("METHOD 5: Entropy (Stain)")
        tiles = []
        
        for p in self.slides[:3]:
            try:
                sl = openslide.OpenSlide(p)
                lv = sl.get_best_level_for_downsample(1)
                ds = sl.level_downsamples[lv]
                w, h = sl.level_dimensions[lv]
                
                for y in range(0, h-sz, sz):
                    for x in range(0, w-sz, sz):
                        if len(tiles)>=200: break
                        t = np.array(sl.read_region((int(x*ds), int(y*ds)), lv, (sz,sz)).convert("RGB"))
                        if not self._bg(t) and self._mask(t).sum()/t.size>=0.3:
                            tiles.append(t.astype(np.float32)/255)
                    if len(tiles)>=200: break
                
                sl.close()
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Slide error: {e}")
                continue
        
        if len(tiles) < 20:
            log_msg("  ‚ö†Ô∏è Insufficient data, using defaults")
            m, s = np.array([0.75, 0.55, 0.45]), np.array([0.15, 0.15, 0.15])
        else:
            ms = [t.mean((0,1)) for t in tiles]
            ss = [t.std((0,1)) for t in tiles]
            m, s = np.mean(ms,0), np.mean(ss,0)
        
        self.results['entropy'] = {'means': m.tolist(), 'stds': s.tolist()}
        log_msg(f"‚úÖ Stain: means={m.round(3)}")
        return m, s
    
    def save(self):
        """Save optimization results"""
        try:
            with open(f"{OUTPUT_DIR}/optimization.json", 'w') as f:
                json.dump({
                    'timestamp': datetime.now().isoformat(),
                    'seed': RANDOM_SEED,
                    **self.results
                }, f, indent=2)
            log_msg(f"‚úÖ Optimization results saved\n")
        except Exception as e:
            log_msg(f"‚ö†Ô∏è Could not save optimization: {e}")

# ===============================
# INTERPRETABLE FEATURES
# ===============================
class InterpExtractor:
    """Extract interpretable histology features"""
    
    def nuclear(self, t):
        """Nuclear morphology features"""
        g = rgb2gray(t)
        try:
            b = g < threshold_otsu(g)*0.8
        except:
            b = g < 100
        
        l = label(b)
        r = regionprops(l)
        
        if not r:
            return {f'nuc_{k}':0 for k in ['cnt','area_m','area_s','dens','circ','sol']}
        
        a = np.array([x.area for x in r])
        c = np.array([4*np.pi*x.area/(x.perimeter**2+1e-8) for x in r])
        s = np.array([x.solidity for x in r])
        
        return {
            'nuc_cnt': len(r),
            'nuc_area_m': a.mean(),
            'nuc_area_s': a.std(),
            'nuc_dens': len(r)/b.size,
            'nuc_circ': c.mean(),
            'nuc_sol': s.mean()
        }
    
    def arch(self, t):
        """Architectural features (organization uniformity)"""
        g = rgb2gray(t)
        sm = gaussian(g, 5)
        
        # Compute local variance
        vs = [np.var(g[i:i+20,j:j+20]) 
              for i in range(0,g.shape[0]-20,20) 
              for j in range(0,g.shape[1]-20,20)]
        
        return {
            'arch_org': np.mean(vs) if vs else 0,
            'arch_uni': np.std(vs) if vs else 0
        }
    
    def texture(self, t):
        """Texture features via GLCM"""
        g = (rgb2gray(t)*255).astype(np.uint8)
        
        try:
            glcm = graycomatrix(g, [1], [0], 256, symmetric=True, normed=True)
            f = {}
            for p in ['contrast','homogeneity','energy']:
                f[f'tex_{p}'] = float(graycoprops(glcm, p)[0,0])
        except:
            f = {f'tex_{p}':0 for p in ['contrast','homogeneity','energy']}
        
        return f
    
    def extract(self, t):
        """Extract all interpretable features from tile"""
        try:
            return {**self.nuclear(t), **self.arch(t), **self.texture(t)}
        except Exception as e:
            # Return zeros on error
            return {f'nuc_{k}':0 for k in ['cnt','area_m','area_s','dens','circ','sol']} | \
                   {'arch_org':0, 'arch_uni':0} | \
                   {f'tex_{p}':0 for p in ['contrast','homogeneity','energy']}

# ===============================
# ATOM EXTRACTOR (RESNET50)
# ===============================
class ATOMExtractor:
    """LUNIT ATOM-style feature extractor using ResNet50"""
    
    def __init__(self):
        log_msg("Loading ATOM (ResNet-50)...")
        try:
            self.model = timm.create_model(
                'resnet50',
                pretrained=True,
                num_classes=0,
                global_pool='avg'
            ).to(DEVICE).eval()
            log_msg("‚úÖ ATOM loaded (2048D)\n")
        except Exception as e:
            log_msg(f"‚ùå ATOM loading failed: {e}")
            raise
        
        self.tf = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225])
        ])
    
    def extract(self, tiles, sz=224):
        """Extract ATOM features from tiles"""
        if not tiles:
            return None
        
        fs = []
        log_msg(f"  Extracting ATOM features from {len(tiles)} tiles...")
        
        for i, t in enumerate(tiles):
            try:
                # Resize if needed
                if t.shape[0]!=sz or t.shape[1]!=sz:
                    t = np.array(Image.fromarray(t).resize((sz,sz)))
                
                x = self.tf(Image.fromarray(t)).unsqueeze(0).to(DEVICE)
                
                with torch.no_grad():
                    fs.append(self.model(x).squeeze().cpu().numpy())
                
                if (i+1)%50==0:
                    print(f"    {i+1}/{len(tiles)}", end='\r')
                    
            except Exception as e:
                continue
        
        if not fs:
            log_msg(f"  ‚ùå No features extracted")
            return None
        
        fs = np.array(fs)
        log_msg(f"  ‚úì Extracted {len(fs)} tile features")
        
        # FIXED: Robust outlier removal
        if len(fs) > 10:  # Only remove outliers if we have enough tiles
            try:
                # Compute z-scores
                mean_feat = fs.mean(0)
                std_feat = fs.std(0)
                
                # Avoid division by zero for constant features
                std_feat = np.where(std_feat < 1e-6, 1.0, std_feat)
                
                z = np.abs((fs - mean_feat) / std_feat)
                
                # More lenient threshold (5 instead of 3)
                # AND require multiple features to be outliers (not just 1)
                outlier_mask = (z > 5).sum(axis=1) > (z.shape[1] * 0.1)  # >10% features are outliers
                
                num_outliers = outlier_mask.sum()
                
                if num_outliers > 0 and num_outliers < len(fs) * 0.5:  # Don't remove >50%
                    fs = fs[~outlier_mask]
                    log_msg(f"  üîç Removed {num_outliers} outlier tiles")
                elif num_outliers >= len(fs) * 0.5:
                    log_msg(f"  ‚ö†Ô∏è Too many outliers ({num_outliers}), keeping all tiles")
                
            except Exception as e:
                log_msg(f"  ‚ö†Ô∏è Outlier detection failed: {e}, keeping all tiles")
        else:
            log_msg(f"  ‚ö†Ô∏è Too few tiles for outlier removal, keeping all")
        
        if len(fs) == 0:
            log_msg(f"  ‚ùå All tiles removed as outliers")
            return None
        
        log_msg(f"  ‚úÖ Final: {len(fs)} tiles")
        
        # Aggregate features (with safety checks)
        try:
            return {
                'atom_m': fs.mean(0),
                'atom_s': fs.std(0),
                'atom_mx': fs.max(0) if len(fs) > 0 else np.zeros(fs.shape[1]),
                'atom_mn': fs.min(0) if len(fs) > 0 else np.zeros(fs.shape[1]),
                'atom_md': np.median(fs, 0) if len(fs) > 0 else np.zeros(fs.shape[1])
            }
        except Exception as e:
            log_msg(f"  ‚ùå Aggregation error: {e}")
            return None

# ===============================
# MAIN PIPELINE
# ===============================
def main():
    files = [f for f in os.listdir(SVS_DIR) if f.lower().endswith('.svs')]
    
    if len(files) < 10:
        log_msg("‚ùå Need ‚â•10 slides for calibration")
        return
    
    log_msg(f"Found {len(files)} SVS files")
    
    # CALIBRATION: Use random 10 slides
    np.random.shuffle(files)
    cal_paths = [os.path.join(SVS_DIR, f) for f in files[:10]]
    
    # PROCESSING: Use ALL slides (NO DATA LOSS)
    proc_files = files
    
    log_msg(f"\n{'='*80}")
    log_msg("STEP 1: OPTIMIZATION (CALIBRATION)")
    log_msg(f"{'='*80}")
    log_msg(f"Calibration slides: {len(cal_paths)}")
    log_msg(f"Processing slides: {len(proc_files)} (ALL - no data loss)\n")
    
    # Run optimization
    opt = Optimizer(cal_paths, 300)
    sz = 224
    n_tiles = opt.elbow(sz)
    blur_th = opt.youden(sz)
    tiss_th = opt.roc(sz)
    boot_m, boot_s = opt.bootstrap(sz)
    stain_m, stain_s = opt.entropy(sz)
    opt.save()
    
    # Save parameters
    params = {
        'tile_sz': sz,
        'n_tiles': n_tiles,
        'blur_th': blur_th,
        'tiss_th': tiss_th,
        'stain_m': stain_m.tolist(),
        'stain_s': stain_s.tolist(),
        'seed': RANDOM_SEED,
        'calibration_slides': 10,
        'processing_slides': len(proc_files)
    }
    
    with open(f"{OUTPUT_DIR}/params.json", 'w') as f:
        json.dump(params, f, indent=2)
    
    log_msg(f"\n{'='*80}")
    log_msg("STEP 2: FEATURE EXTRACTION")
    log_msg(f"{'='*80}\n")
    
    # Initialize extractors
    interp = InterpExtractor()
    
    try:
        atom = ATOMExtractor()
    except:
        log_msg("‚ö†Ô∏è ATOM loading failed, continuing with interpretable features only")
        atom = None
    
    # Storage
    interp_res, atom_res, qc = [], [], []
    
    # Process all slides
    for i, fn in enumerate(proc_files, 1):
        log_msg(f"\n[{i}/{len(proc_files)}] {fn}")
        
        try:
            sl = openslide.OpenSlide(os.path.join(SVS_DIR, fn))
            lv = sl.get_best_level_for_downsample(1)
            ds = sl.level_downsamples[lv]
            w, h = sl.level_dimensions[lv]
            
            tiles = []
            
            # Extract tiles
            for y in range(0, h-sz, sz):
                for x in range(0, w-sz, sz):
                    if len(tiles)>=n_tiles:
                        break
                    
                    t = np.array(sl.read_region(
                        (int(x*ds), int(y*ds)),
                        lv,
                        (sz,sz)
                    ).convert("RGB"))
                    
                    # QC checks
                    if np.mean(t)>220:  # Background
                        continue
                    
                    g = rgb2gray(t)
                    m = g < threshold_otsu(g) if g.std()>1 else g<200
                    
                    if m.sum()/m.size < tiss_th:  # Tissue percentage
                        continue
                    
                    if opt._blur(t) < blur_th:  # Blur
                        continue
                    
                    tiles.append(t)
                
                if len(tiles)>=n_tiles:
                    break
            
            sl.close()
            
            # Check minimum tiles
            if len(tiles) < n_tiles//2:
                log_msg(f"  ‚ùå Insufficient tiles: {len(tiles)}")
                qc.append({'slide': fn, 'status': 'fail', 'reason': 'insufficient_tiles', 'tiles': len(tiles)})
                continue
            
            # Extract interpretable features
            ifs = [interp.extract(t) for t in tiles]
            idf = pd.DataFrame(ifs)
            
            iagg = {'slide': fn}
            for c in idf.columns:
                iagg[f'{c}_m'] = idf[c].mean()
                iagg[f'{c}_s'] = idf[c].std()
            
            interp_res.append(iagg)
            
            # Extract ATOM features
            if atom:
                try:
                    af = atom.extract(tiles, sz)
                    if af:
                        aagg = {'slide': fn}
                        for k, v in af.items():
                            for j, x in enumerate(v):
                                aagg[f'{k}_{j}'] = float(x)
                        atom_res.append(aagg)
                    else:
                        log_msg(f"  ‚ö†Ô∏è ATOM extraction returned None")
                except Exception as e:
                    log_msg(f"  ‚ö†Ô∏è ATOM extraction failed: {e}")
                    traceback.print_exc()
            
            log_msg(f"  ‚úÖ Success: {len(tiles)} tiles")
            qc.append({'slide': fn, 'status': 'ok', 'tiles': len(tiles)})
            
            # Periodic save
            if i%10==0:
                pd.DataFrame(interp_res).to_csv(f"{OUTPUT_DIR}/interpretable.csv", index=False)
                if atom_res:
                    pd.DataFrame(atom_res).to_csv(f"{OUTPUT_DIR}/atom.csv", index=False)
                pd.DataFrame(qc).to_csv(f"{OUTPUT_DIR}/qc.csv", index=False)
                log_msg(f"  üíæ Checkpoint saved ({i} slides processed)")
        
        except Exception as e:
            log_msg(f"  ‚ùå Error: {e}")
            traceback.print_exc()
            qc.append({'slide': fn, 'status': 'fail', 'reason': str(e), 'tiles': 0})
    
    # Final save
    log_msg(f"\n{'='*80}")
    log_msg("FINAL SAVE")
    log_msg(f"{'='*80}")
    
    if interp_res:
        pd.DataFrame(interp_res).to_csv(f"{OUTPUT_DIR}/interpretable.csv", index=False)
        log_msg(f"‚úÖ Interpretable features: {len(interp_res)} slides")
    
    if atom_res:
        pd.DataFrame(atom_res).to_csv(f"{OUTPUT_DIR}/atom.csv", index=False)
        log_msg(f"‚úÖ ATOM features: {len(atom_res)} slides")
    
    pd.DataFrame(qc).to_csv(f"{OUTPUT_DIR}/qc.csv", index=False)
    log_msg(f"‚úÖ QC report saved")
    
    # Summary
    qc_df = pd.DataFrame(qc)
    success = (qc_df['status']=='ok').sum()
    failed = (qc_df['status']=='fail').sum()
    
    log_msg(f"\n{'='*80}")
    log_msg("PIPELINE COMPLETED")
    log_msg(f"{'='*80}")
    log_msg(f"‚úÖ Successful: {success}/{len(qc_df)} ({success/len(qc_df)*100:.1f}%)")
    log_msg(f"‚ùå Failed: {failed}/{len(qc_df)}")
    log_msg(f"\nOutput files:")
    log_msg(f"  - {OUTPUT_DIR}/interpretable.csv")
    log_msg(f"  - {OUTPUT_DIR}/atom.csv")
    log_msg(f"  - {OUTPUT_DIR}/qc.csv")
    log_msg(f"  - {OUTPUT_DIR}/params.json")
    log_msg(f"  - {OUTPUT_DIR}/optimization.json")

if __name__ == "__main__":
    main()

I'll replace the ResNet-50 loading with CTransPath, loading the weights from the weights folder. The rest of the code remains unchanged. 

Using "Replace String in File"

Let me be more specific with additional context: 

Using "Replace String in File"

Read [](file:///d%3A/paper/new.ipynb#1360-1360), lines 1360 to 1385

Read [](file:///d%3A/paper/new.ipynb#1200-1200), lines 1200 to 1235

Read [](file:///d%3A/paper/new.ipynb#1130-1130), lines 1130 to 1160

Now I'll replace the ResNet-50 section with CTransPath: 

Using "Replace String in File"

Let me search for all occurrences to be more precise: 

Searched for regex `ATOM EXTRACTOR.*RESNET50` (`**/*.ipynb`), 1 result

Let me use edit_notebook_file instead to modify the cell directly. First, let me get the exact cell: 

Using "Replace String in File"

There appear to be multiple notebook cells. Let me check how many cells have this pattern and modify the last one (cell 4): 

Read [](file:///d%3A/paper/new.ipynb#1130-1130), lines 1130 to 1180

Now I'll use a more specific match including unique following code: 

Using "Replace String in File"

Let me check the full notebook structure to understand where these duplicates are: 

Searched for regex `class ATOMExtractor` (`**/d:\\paper\\new.ipynb`), no results

Searched for regex `ATOM EXTRACTOR`, 1 result

Good, there's only one. The issue is likely that the notebook has the same code in cell output. Let me be even more specific with additional unique context around it: 

Read [](file:///d%3A/paper/new.ipynb#1115-1115), lines 1115 to 1125