## Quick Start (in Colab)

1. Runtime → Change runtime type → GPU (prefer T4).
2. Run Section 2 to install packages (first time only).
3. In Section 3, upload kaggle.json and set `KAGGLE_DATASET`.
4. In Section 4, set `CSV_PATH`, `IMAGE_DIR`, `IMAGE_COL`, and confirm `TARGET_COL`.
5. Run through Sections 5–8 to split and (optionally) augment to ~1000 samples.
6. Optionally annotate/correct labels in Section 9 and re-run splits if changed.
7. Inspect features (Section 10) and then train the CNN (Sections 11–13).
8. Review metrics and visuals (Section 14) and Grad-CAM (Section 15).
9. Optionally enable K-Fold (Section 16) for robustness.
10. Evaluate on test + export model and predictions (Sections 17–18).

# GSM Microscopy Pipeline (Colab-Ready)

This notebook downloads a Kaggle microscopy dataset, expands a small set of ~130 images to ~1000 via augmentation, supports optional label annotation, extracts features, and trains a strong GSM (grams per square metre, g/m^2) regressor (and optional classifier) on a T4 GPU. It includes rich visualizations for augmentations, embeddings, and model explanations (Grad-CAM).

In [None]:
# 1) Check GPU and Set Reproducibility
import os, random, math
import numpy as np
import torch

def set_seed(seed: int = 42):
    """Seed every RNG the notebook uses and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), then configures cuDNN for repeatable kernels.

    Args:
        seed: Value used to seed all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune its kernel choice per run, which
    # defeats the deterministic flag above; keep it off for reproducibility.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any other cell draws random numbers.
set_seed(42)

# Select the compute device. The AMP gradient scaler (`scaler`) is only
# created on CUDA runtimes; CPU runs never define it.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("CUDA not available. Running on CPU.")
else:
    device = torch.device('cuda')
    gpu_name = torch.cuda.get_device_name(0)
    print(f"CUDA available: {gpu_name}")
    # Prefer mixed precision on T4
    from torch.cuda.amp import autocast, GradScaler
    scaler = GradScaler()

In [None]:
# 2) Install and Import Dependencies
# When running in Colab, the packages below are installed automatically by
# the `!pip` line; outside Colab they must already be installed.
# Note: In Colab, this may require a runtime restart after installation.

# IN_COLAB is True only when the `google.colab` module can be imported.
IN_COLAB = False
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    !pip -q install kaggle albumentations timm pyyaml umap-learn opencv-python-headless kagglehub scikit-image

import os
import json
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.decomposition import PCA

import albumentations as A
from albumentations.pytorch import ToTensorV2

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Optional: KaggleHub for dataset download
try:
    import kagglehub
    HAVE_KAGGLEHUB = True
except Exception:
    HAVE_KAGGLEHUB = False

# Optional: skimage for GLCM/LBP
# scikit-image renamed greycomatrix/greycoprops to graycomatrix/graycoprops.
# Try the current spelling first and alias it to the names the rest of the
# notebook uses, so GLCM features are not silently disabled on new releases.
try:
    from skimage.feature import local_binary_pattern
    try:
        from skimage.feature import graycomatrix as greycomatrix
        from skimage.feature import graycoprops as greycoprops
    except ImportError:
        from skimage.feature import greycomatrix, greycoprops
    HAVE_SKIMAGE = True
except Exception:
    HAVE_SKIMAGE = False

# Log the versions of the core libraries in use (helps when debugging API differences).
print("torch:", torch.__version__, "sklearn:", sklearn.__version__, "timm:", timm.__version__)

In [None]:
# 2b) Training Configuration (Backbone/Size/Epochs)
# Choose your model and training scale here. Later cells read these names
# via globals(), so edit them here before running the rest of the notebook.
# Examples: 'efficientnet_b3', 'convnext_tiny', 'resnet50', 'efficientnet_b0'
BACKBONE = 'efficientnet_b3'    # or 'convnext_tiny'
IMG_SIZE = 352                   # try 320–380 on T4; Section 11 uses batch 16 when IMG_SIZE >= 320, else 32
EPOCHS = 50                      # try 40–60 for stronger convergence
PATIENCE = 7                     # early stopping patience (epochs without val MAE improvement)

# Optional manual batch size override (None -> auto based on IMG_SIZE)
BATCH_SIZE_OVERRIDE = None
print(f"Config -> BACKBONE={BACKBONE}, IMG_SIZE={IMG_SIZE}, EPOCHS={EPOCHS}, PATIENCE={PATIENCE}")

In [None]:
# 3) Download Dataset from Kaggle
# Instructions (Colab):
# 1) Create kaggle.json at /content/kaggle.json or upload via Files UI.
# 2) Set KAGGLE_DATASET below (e.g., "owner/dataset-slug").
# 3) Run this cell to download and unzip to /content/data.

KAGGLE_DATASET = "owner/dataset-slug"  # TODO: replace with your Kaggle dataset slug
# All data lives under <BASE_DIR>/data: /content in Colab, the working directory otherwise.
BASE_DIR = "/content" if IN_COLAB else os.getcwd()
DATA_ROOT = os.path.join(BASE_DIR, "data")
os.makedirs(DATA_ROOT, exist_ok=True)

if IN_COLAB:
    import shutil
    # Ensure Kaggle API key: copy it to ~/.kaggle/kaggle.json with owner-only permissions.
    if os.path.exists("/content/kaggle.json"):
        os.makedirs(os.path.join(os.path.expanduser("~"), ".kaggle"), exist_ok=True)
        shutil.copy("/content/kaggle.json", os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json"))
        os.chmod(os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json"), 0o600)
    else:
        print("Upload kaggle.json to /content/kaggle.json or set it manually.")

    # The download only runs once the placeholder slug has been replaced.
    if KAGGLE_DATASET != "owner/dataset-slug":
        !kaggle datasets download -d $KAGGLE_DATASET -p $DATA_ROOT -q
        !unzip -oq "$DATA_ROOT/$(basename $KAGGLE_DATASET).zip" -d $DATA_ROOT
        print("Downloaded into:", DATA_ROOT)
    else:
        print("Please set KAGGLE_DATASET to the correct 'owner/dataset' slug.")
else:
    print("Not in Colab. Ensure your data exists under:", DATA_ROOT)

# Quick check: walk DATA_ROOT recursively and count files with a known image extension.
if os.path.exists(DATA_ROOT):
    all_files = []
    for root, _, files in os.walk(DATA_ROOT):
        for f in files:
            all_files.append(os.path.join(root, f))
    img_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
    imgs = [f for f in all_files if os.path.splitext(f)[1].lower() in img_exts]
    print(f"Found {len(all_files)} files; {len(imgs)} images.")

# 3b) Alternative: Download via KaggleHub (FabricNet)
# If you prefer not to set kaggle.json, use KaggleHub below.
# NOTE(review): this branch runs whenever kagglehub is importable and, on
# success, replaces DATA_ROOT with the hard-coded "acseckn/fabricnet"
# download path, even if a different KAGGLE_DATASET was fetched by the code
# before it. Confirm that override is intended.
if HAVE_KAGGLEHUB:
    try:
        hub_path = kagglehub.dataset_download("acseckn/fabricnet")
        print("KaggleHub path:", hub_path)
        DATA_ROOT = hub_path
        # Quick check for images (same recursive scan, now over the KaggleHub path)
        all_files = []
        for root, _, files in os.walk(DATA_ROOT):
            for f in files:
                all_files.append(os.path.join(root, f))
        img_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
        imgs = [f for f in all_files if os.path.splitext(f)[1].lower() in img_exts]
        print(f"(KaggleHub) Found {len(all_files)} files; {len(imgs)} images.")
    except Exception as e:
        # Best-effort: on failure DATA_ROOT keeps whatever value it had before this block.
        print("KaggleHub download failed:", e)
else:
    print("KaggleHub not available. Try: pip install kagglehub (on Colab it's auto-installed above)")

In [None]:
# 4) Load CSV and Resolve GSM Target
# Each setting below keeps a value assigned by an earlier cell; the default
# is applied only when the name is undefined or set to None.

if globals().get('CSV_PATH') is None:
    CSV_PATH = os.path.join(DATA_ROOT, "metadata.csv")  # TODO: adjust filename
if globals().get('IMAGE_DIR') is None:
    IMAGE_DIR = os.path.join(DATA_ROOT, "images")       # TODO: adjust folder
if globals().get('IMAGE_COL') is None:
    IMAGE_COL = "filename"                               # CSV column holding the image file name/path
if globals().get('TARGET_COL') is None:
    TARGET_COL = "gsm"                                   # preferred target column name after cleaning
if globals().get('ALT_MASS_COL') is None:
    ALT_MASS_COL = "mass"                                # fallback column when the target column is absent
if globals().get('AREA_COL') is None:
    AREA_COL = "area_m2"                                 # optional area column if available

# Fail early, with the offending path in the message, if inputs are missing.
assert os.path.exists(DATA_ROOT), f"Data root not found: {DATA_ROOT}"
assert os.path.exists(CSV_PATH), f"CSV not found: {CSV_PATH}"

df = pd.read_csv(CSV_PATH)
print("Columns:", list(df.columns))

# Normalize image path

def resolve_path(row):
    """Return the image path for one metadata row.

    Absolute paths in ``IMAGE_COL`` are returned unchanged. Relative ones are
    joined onto ``IMAGE_DIR`` when that directory exists, otherwise onto the
    directory containing ``CSV_PATH``.
    """
    rel = str(row[IMAGE_COL])
    if not os.path.isabs(rel):
        root = os.path.dirname(CSV_PATH) if not os.path.exists(IMAGE_DIR) else IMAGE_DIR
        return os.path.join(root, rel)
    return rel

# Resolve/derive GSM
if TARGET_COL not in df.columns:
    # Try to derive GSM from mass/area
    if ALT_MASS_COL in df.columns and AREA_COL in df.columns:
        # Convert to GSM = mass [g] / area [m^2]; zero areas become NaN so the
        # division yields NaN (dropped below) instead of inf.
        df[TARGET_COL] = df[ALT_MASS_COL] / (df[AREA_COL].replace(0, np.nan))
        print(f"Derived '{TARGET_COL}' from '{ALT_MASS_COL}' and '{AREA_COL}'.")
    elif ALT_MASS_COL in df.columns:
        print(f"Found '{ALT_MASS_COL}'. If units differ, convert to GSM using GSM=mass/area.")
        # Create placeholder; user may annotate missing values
        df[TARGET_COL] = df[ALT_MASS_COL]
    else:
        raise ValueError("No GSM or mass column found. Please set TARGET_COL/ALT_MASS_COL correctly.")

# Basic unit hints
print("Unit hints: 1 mg/cm^2 = 10 g/m^2; 1 g/m^2 = 0.1 mg/cm^2")

# Clean and ensure valid rows: keep only rows whose image exists on disk and
# whose target is a finite number.
df["image_path"] = df.apply(resolve_path, axis=1)
df = df[df["image_path"].apply(os.path.exists)].copy()
# Coerce to numeric first: np.isfinite raises TypeError on object/string
# columns, whereas unparseable entries become NaN here and are filtered out.
df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")
df = df[np.isfinite(df[TARGET_COL])].copy()
print("Rows after cleanup:", len(df))

# Persist cleaned CSV
CLEAN_CSV = os.path.join(DATA_ROOT, "metadata_clean.csv")
df.to_csv(CLEAN_CSV, index=False)
print("Saved:", CLEAN_CSV)

In [None]:
# 5) Train/Val/Test Split with Stratified Target Bins
N_BINS = 5
TEST_SIZE = 0.15
VAL_SIZE = 0.15
RANDOM_STATE = 42

# Quantile-bin the continuous target so the stratified splitter has discrete
# classes to balance across splits.
bins = pd.qcut(df[TARGET_COL], q=N_BINS, labels=False, duplicates='drop')
holdout_frac = TEST_SIZE + VAL_SIZE
X_train, X_temp, y_train, y_temp, b_train, b_temp = train_test_split(
    df,
    df[TARGET_COL],
    bins,
    test_size=holdout_frac,
    random_state=RANDOM_STATE,
    stratify=bins,
)

# Carve the held-out part into val/test, re-binning it with fewer quantiles.
val_ratio = VAL_SIZE / holdout_frac
temp_bins = pd.qcut(X_temp[TARGET_COL], q=max(2, N_BINS//2), labels=False, duplicates='drop')
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    X_temp[TARGET_COL],
    test_size=(1 - val_ratio),
    random_state=RANDOM_STATE,
    stratify=temp_bins,
)

# Persist the three splits next to the data.
SPLIT_DIR = os.path.join(DATA_ROOT, "splits")
os.makedirs(SPLIT_DIR, exist_ok=True)
X_train.to_csv(os.path.join(SPLIT_DIR, "train.csv"), index=False)
X_val.to_csv(os.path.join(SPLIT_DIR, "val.csv"), index=False)
X_test.to_csv(os.path.join(SPLIT_DIR, "test.csv"), index=False)

print(len(X_train), len(X_val), len(X_test))

In [None]:
# 6) Preview Images and GSM Distribution
sns.set_style("whitegrid")
# Single histogram panel; squeeze=False + ravel keeps `axes` indexable.
fig, axes = plt.subplots(1, 1, figsize=(6, 4), squeeze=False)
axes = axes.ravel()
sns.histplot(df[TARGET_COL], kde=True, ax=axes[0])
axes[0].set_title("GSM Distribution")
axes[0].set_xlabel("GSM (g/m^2)")

sample_paths = X_train["image_path"].sample(min(9, len(X_train)), random_state=42).tolist()
cols = 3
rows = math.ceil(len(sample_paths)/cols)
# Guard the grid: plt.subplots cannot build a figure with zero rows.
if sample_paths:
    fig2, axarr = plt.subplots(rows, cols, figsize=(12, 4*rows), squeeze=False)
    axarr = axarr.ravel()
    # Hide every axis up front so unused cells and unreadable images stay
    # blank instead of showing an empty framed plot.
    for ax in axarr:
        ax.axis('off')
    for ax, p in zip(axarr, sample_paths):
        img = cv2.imread(p)
        if img is None:
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax.imshow(img)
        gsm_val = float(df.loc[df["image_path"]==p, TARGET_COL].values[0])
        ax.set_title(f"{os.path.basename(p)}\nGSM={gsm_val:.2f}")
    plt.tight_layout()
plt.show()

In [None]:
# 7) Define Augmentation Pipeline (Albumentations) and Visualize
IMG_SIZE = globals().get('IMG_SIZE', 256)


def _random_resized_crop(size):
    """Build the square RandomResizedCrop across albumentations API versions."""
    crop_kwargs = dict(scale=(0.8, 1.0), ratio=(0.9, 1.1), p=1.0)
    try:
        # Newer releases take the output size as a single `size=(h, w)` argument.
        return A.RandomResizedCrop(size=(size, size), **crop_kwargs)
    except TypeError:
        # Older releases only know the positional (height, width) form.
        return A.RandomResizedCrop(size, size, **crop_kwargs)


def _cutout_like(size):
    """Return a 4-hole square dropout transform, using Cutout when it exists."""
    hole = max(1, size // 10)
    if hasattr(A, "Cutout"):
        return A.Cutout(num_holes=4, max_h_size=size//10, max_w_size=size//10, p=0.3)
    # Cutout is absent from newer albumentations; CoarseDropout replaces it.
    try:
        return A.CoarseDropout(
            num_holes_range=(4, 4),
            hole_height_range=(hole, hole),
            hole_width_range=(hole, hole),
            p=0.3,
        )
    except TypeError:
        # Releases that predate the *_range arguments.
        return A.CoarseDropout(
            max_holes=4, max_height=hole, max_width=hole,
            min_holes=4, min_height=hole, min_width=hole,
            p=0.3,
        )


train_aug = A.Compose([
    _random_resized_crop(IMG_SIZE),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.3),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.2, rotate_limit=20, p=0.7),
    # At most one blur variant per sample.
    A.OneOf([
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.GaussianBlur(blur_limit=3, p=0.1),
    ], p=0.3),
    # At most one photometric change per sample.
    A.OneOf([
        A.CLAHE(clip_limit=2.0, p=0.8),
        A.RandomBrightnessContrast(p=0.8),
        A.HueSaturationValue(p=0.8)
    ], p=0.7),
    _cutout_like(IMG_SIZE),
])

# Validation-style transform: deterministic resize only.
val_tfms = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE)
])


def visualize_augments(image_path: str, n: int = 8):
    """Show ``n`` random ``train_aug`` variants of one image side by side.

    Args:
        image_path: Path of the image to augment.
        n: Number of augmented variants to draw.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # squeeze=False keeps a 2-D axes array even when n == 1.
    fig, axes = plt.subplots(1, n, figsize=(3*n, 3), squeeze=False)
    for ax in axes.ravel():
        aug = train_aug(image=img)
        ax.imshow(aug["image"])
        ax.axis('off')
    plt.suptitle("Augmented variants")
    plt.show()

if len(sample_paths) > 0:
    visualize_augments(sample_paths[0], n=6)

In [None]:
# 8) Augment to ~1000 Samples with On-Disk Cache
TARGET_SIZE = 1000
AUG_DIR = os.path.join(DATA_ROOT, "augmented")
os.makedirs(AUG_DIR, exist_ok=True)

# Only the training split is augmented; val/test stay untouched.
train_df = X_train.copy().reset_index(drop=True)
cur_n = len(train_df)
print(f"Current train size: {cur_n}")

if cur_n < TARGET_SIZE:
    need = TARGET_SIZE - cur_n
    # Oversample rows (with replacement) to pick the source image of each new sample
    indices = np.random.choice(train_df.index, size=need, replace=True)
    new_rows = []
    for idx in tqdm(indices, desc="Augmenting"):
        row = train_df.loc[idx]
        src = row["image_path"]
        img = cv2.imread(src)
        if img is None:
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        aug = train_aug(image=img)["image"]
        # Save under a randomised name so repeated draws of one source do not collide
        base = os.path.splitext(os.path.basename(src))[0]
        out_name = f"{base}_aug_{np.random.randint(int(1e9))}.jpg"
        out_path = os.path.join(AUG_DIR, out_name)
        written = cv2.imwrite(out_path, cv2.cvtColor(aug, cv2.COLOR_RGB2BGR), [int(cv2.IMWRITE_JPEG_QUALITY), 95])
        if not written:
            # cv2.imwrite returns False on failure; skip the row so the CSV
            # never references a file that was not written.
            continue
        # The augmented copy inherits every column (including the target) of its source row.
        r = row.copy()
        r["image_path"] = out_path
        new_rows.append(r)
    if new_rows:
        train_df = pd.concat([train_df, pd.DataFrame(new_rows)], ignore_index=True)

print("Augmented train size:", len(train_df))

AUG_CSV = os.path.join(DATA_ROOT, "train_augmented.csv")
train_df.to_csv(AUG_CSV, index=False)
print("Saved:", AUG_CSV)

In [None]:
# 9) Optional Label Annotation/Correction Widget
from ipywidgets import HBox, VBox, Button, FloatText, IntText, Label, Output
from IPython.display import display, clear_output

# Work on a copy so edits do not touch `df` until the saved CSV is reloaded.
annot_df = df.copy().reset_index(drop=True)
row_idx = IntText(value=0, description='Index:')
label_box = FloatText(value=float(annot_df.loc[0, TARGET_COL]), description='GSM:')
status = Label(value='')
out = Output()

btn_prev = Button(description='Prev', button_style='')
btn_next = Button(description='Next', button_style='')
btn_save = Button(description='Save', button_style='success')


def show_row(i):
    """Display row ``i`` (clamped to the valid range) and load its label into the editor."""
    i = int(np.clip(i, 0, len(annot_df)-1))
    row_idx.value = i
    label_box.value = float(annot_df.loc[i, TARGET_COL]) if np.isfinite(annot_df.loc[i, TARGET_COL]) else 0.0
    img = cv2.imread(annot_df.loc[i, "image_path"])
    if img is not None:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        with out:
            clear_output(wait=True)
            plt.figure(figsize=(5,5))
            plt.imshow(img)
            plt.title(f"{os.path.basename(annot_df.loc[i, 'image_path'])}\nCurrent GSM={annot_df.loc[i, TARGET_COL]}")
            plt.axis('off')
            plt.show()


def on_prev(_):
    """Button callback: step to the previous row."""
    show_row(row_idx.value - 1)

def on_next(_):
    """Button callback: step to the next row."""
    show_row(row_idx.value + 1)

def on_save(_):
    """Button callback: store the edited label and rewrite the annotated CSV."""
    # Clamp before writing: assigning through .loc with an index that does
    # not exist would append a new, otherwise-empty row to annot_df.
    i = int(np.clip(row_idx.value, 0, len(annot_df)-1))
    annot_df.loc[i, TARGET_COL] = float(label_box.value)
    status.value = f"Saved row {i}"
    # Persist the full table; the same file is overwritten on every save.
    out_csv = os.path.join(DATA_ROOT, "metadata_annotated.csv")
    annot_df.to_csv(out_csv, index=False)
    print("Wrote:", out_csv)
    show_row(i)

btn_prev.on_click(on_prev)
btn_next.on_click(on_next)
btn_save.on_click(on_save)

ui = VBox([
    HBox([row_idx, label_box, btn_prev, btn_next, btn_save]),
    status,
    out
])

show_row(0)
display(ui)

In [None]:
# 10) Handcrafted Features and UMAP Visualization
import umap

# Existing simple features (color moments, histograms, Laplacian/Sobel)
def compute_features(img_path):
    """Compute a handcrafted feature vector for one image.

    Layout: per-channel mean/std/skew (9 values), per-channel 32-bin
    normalised histograms (96 values), Laplacian variance and summed mean
    absolute Sobel response (2 values).

    Returns:
        A float32 NumPy vector, or None when the image cannot be read.
    """
    bgr = cv2.imread(img_path)
    if bgr is None:
        return None
    rgb = cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), (IMG_SIZE, IMG_SIZE))
    values = []
    # Colour moments; the epsilon keeps the skew finite on flat channels.
    for idx in range(3):
        plane = rgb[..., idx].astype(np.float32)
        mu = plane.mean()
        sd = plane.std() + 1e-6
        skew = (((plane - mu)/sd)**3).mean()
        values += [mu, sd, skew]
    # Normalised intensity histograms, one per channel.
    for idx in range(3):
        counts = cv2.calcHist([rgb],[idx],None,[32],[0,256]).flatten()
        values.extend((counts / (counts.sum() + 1e-6)).tolist())
    # Texture cues from the grayscale image.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    values.append(cv2.Laplacian(gray, cv2.CV_32F).var())
    grad_x = cv2.Sobel(gray, cv2.CV_32F, 1, 0)
    grad_y = cv2.Sobel(gray, cv2.CV_32F, 0, 1)
    values.append((np.abs(grad_x).mean() + np.abs(grad_y).mean()))
    return np.array(values, dtype=np.float32)

# New: GLCM + LBP features (if skimage is available)
def compute_features_glcm_lbp(img_path, distances=(1,2,4), angles=(0, np.pi/4, np.pi/2, 3*np.pi/4)):
    """Compute GLCM texture properties plus a uniform-LBP histogram for one image.

    Args:
        img_path: Path of the image to read.
        distances: Pixel offsets passed to the co-occurrence matrix.
        angles: Directions (radians) passed to the co-occurrence matrix.

    Returns:
        A 1-D array of the flattened GLCM properties followed by the
        normalised LBP histogram, or None when scikit-image is unavailable
        or the image cannot be read.
    """
    if not HAVE_SKIMAGE:
        return None
    img = cv2.imread(img_path)
    if img is None:
        return None
    gray = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), cv2.COLOR_RGB2GRAY)
    gray = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
    # Quantize intensities by a factor of 4 (values 0..63) so the GLCM needs only 64 levels
    gray_q = (gray / 4).astype(np.uint8)  # reduce levels to 0..63 to limit matrix size
    glcm = greycomatrix(gray_q, distances=distances, angles=angles, levels=64, symmetric=True, normed=True)
    props = ['contrast','dissimilarity','homogeneity','energy','correlation','ASM']
    glcm_feats = []
    # One value per (distance, angle) pair for every property, flattened in order.
    for p in props:
        glcm_feats.extend(greycoprops(glcm, p).ravel().tolist())
    # LBP histogram, computed on the unquantized grayscale image
    P, R = 8, 1
    lbp = local_binary_pattern(gray, P, R, method='uniform')
    n_bins = P + 2
    (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, n_bins + 1), range=(0, n_bins))
    # Normalise to a distribution; the epsilon avoids division by zero.
    hist = hist.astype('float32'); hist /= (hist.sum() + 1e-6)
    return np.concatenate([np.array(glcm_feats, dtype=np.float32), hist], axis=0)

# Visualize simple features with UMAP
subset = df.sample(min(300, len(df)), random_state=42).reset_index(drop=True)
# Collect each label together with its feature vector. Slicing the label
# column by count would shift labels whenever an unreadable image is skipped.
X_feats, y_feats = [], []
for p, target in tqdm(zip(subset["image_path"], subset[TARGET_COL]), total=len(subset), desc="Features-simple"):
    f = compute_features(p)
    if f is not None:
        X_feats.append(f)
        y_feats.append(target)
X_feats = np.vstack(X_feats)
y_feats = np.asarray(y_feats)

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
emb = reducer.fit_transform(X_feats)

plt.figure(figsize=(6,5))
sc = plt.scatter(emb[:,0], emb[:,1], c=y_feats, cmap='viridis', s=12)
plt.title('UMAP of Simple Handcrafted Features (colored by GSM)')
plt.colorbar(sc, label='GSM')
plt.show()

# Visualize GLCM+LBP features (if available)
if HAVE_SKIMAGE:
    X_feats2, y_feats2 = [], []
    for p, target in tqdm(zip(subset["image_path"], subset[TARGET_COL]), total=len(subset), desc="Features-GLCM/LBP"):
        f = compute_features_glcm_lbp(p)
        if f is not None:
            X_feats2.append(f)
            y_feats2.append(target)
    # Only embed when more than 10 feature vectors were computed.
    if len(X_feats2) > 10:
        X_feats2 = np.vstack(X_feats2)
        y_feats2 = np.asarray(y_feats2)
        emb2 = reducer.fit_transform(X_feats2)
        plt.figure(figsize=(6,5))
        sc2 = plt.scatter(emb2[:,0], emb2[:,1], c=y_feats2, cmap='plasma', s=12)
        plt.title('UMAP of GLCM + LBP Features (colored by GSM)')
        plt.colorbar(sc2, label='GSM')
        plt.show()
    else:
        print("GLCM/LBP features not computed (insufficient images or skimage missing).")
else:
    print("scikit-image not available: skipping GLCM/LBP features. You can enable by installing scikit-image.")

In [None]:
# 11) PyTorch Dataset and DataLoaders
# Per-channel mean/std used to normalise inputs for the pretrained backbone.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training transform: resize, light random geometric augmentation, then
# normalisation and conversion to a tensor.
train_aug_torch = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.5),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2()
])

# Evaluation transform: same resize/normalisation with no random components.
val_aug_torch = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2()
])

class GSMDataset(Dataset):
    """Image-regression dataset backed by a DataFrame.

    Each row must provide an ``image_path`` column and the ``TARGET_COL``
    value. Items are ``(image, target)`` pairs where ``target`` is a float32
    scalar tensor and ``image`` is the output of ``transform`` (or the raw
    RGB array when no transform is given).
    """

    def __init__(self, frame: pd.DataFrame, transform=None):
        # A fresh 0..n-1 index lets __getitem__ look rows up by position label.
        self.transform = transform
        self.frame = frame.reset_index(drop=True)

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, idx):
        record = self.frame.loc[idx]
        image_file = record["image_path"]
        target = float(record[TARGET_COL])
        bgr = cv2.imread(image_file)
        if bgr is None:
            # Unreadable file: substitute a black image so loading keeps going.
            bgr = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        if self.transform:
            rgb = self.transform(image=rgb)["image"]
        return rgb, torch.tensor(target, dtype=torch.float32)

# Use augmented train if created
# NOTE(review): this reads whatever train_augmented.csv is on disk. A file
# left over from an earlier run with a different split would be used as-is;
# re-run Section 8 after changing the split.
if os.path.exists(AUG_CSV):
    train_df_for_torch = pd.read_csv(AUG_CSV)
else:
    train_df_for_torch = X_train

val_df_for_torch = X_val
test_df_for_torch = X_test

# Only the training set gets random augmentation.
train_ds = GSMDataset(train_df_for_torch, transform=train_aug_torch)
val_ds   = GSMDataset(val_df_for_torch,   transform=val_aug_torch)
test_ds  = GSMDataset(test_df_for_torch,  transform=val_aug_torch)

# Auto-tune batch size for larger images unless overridden
if 'BATCH_SIZE_OVERRIDE' in globals() and BATCH_SIZE_OVERRIDE is not None:
    BATCH_SIZE = int(BATCH_SIZE_OVERRIDE)
else:
    BATCH_SIZE = 16 if IMG_SIZE >= 320 else 32

NUM_WORKERS = 2 if IN_COLAB else 0
# Pinned host memory only helps host-to-GPU copies; requesting it without a
# GPU just produces a warning from the DataLoader.
PIN_MEMORY = torch.cuda.is_available()

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

len(train_ds), len(val_ds), len(test_ds)

In [None]:
# 12) Define CNN Regressor (Transfer Learning)
class Regressor(nn.Module):
    """timm backbone with a dropout + linear head producing one value per image."""

    def __init__(self, backbone_name='efficientnet_b0', pretrained=True):
        super().__init__()
        # num_classes=0 removes the classifier, so the backbone returns pooled features.
        self.backbone = timm.create_model(
            backbone_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
        )
        feature_dim = self.backbone.num_features
        self.head = nn.Sequential(nn.Dropout(0.2), nn.Linear(feature_dim, 1))

    def forward(self, x):
        # (batch, 1) -> (batch,) so predictions line up with the scalar targets.
        return self.head(self.backbone(x)).squeeze(1)

# Use configured backbone (falls back to efficientnet_b0 if the config cell was not run)
BACKBONE = globals().get('BACKBONE', 'efficientnet_b0')
model = Regressor(backbone_name=BACKBONE).to(device)
# Huber loss with delta=1.0 on the raw target values.
criterion = nn.HuberLoss(delta=1.0)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)

# Use configured epochs
EPOCHS = globals().get('EPOCHS', 20)
# The cosine schedule spans only the post-warmup epochs; the training cell
# sets the learning rate by hand during the first `warmup_epochs` epochs.
warmup_epochs = 2
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(1, EPOCHS - warmup_epochs))

# Best validation MAE so far and where the matching checkpoint is written.
best_val_mae = float('inf')
best_ckpt = os.path.join(DATA_ROOT, 'best_model.pt')

In [None]:
# 13) Training Loop with AMP, Early Stopping, Checkpoints
from collections import defaultdict

history = defaultdict(list)
PATIENCE = globals().get('PATIENCE', 7)
no_improve = 0

for epoch in range(1, EPOCHS+1):
    # Linear warmup is applied BEFORE the epoch trains, so the first epoch
    # really runs at the reduced rate. base_lrs holds the LR each param group
    # had when the scheduler was built.
    if epoch <= warmup_epochs:
        for g, base_lr in zip(optimizer.param_groups, scheduler.base_lrs):
            g['lr'] = base_lr * (epoch / max(1, warmup_epochs))
    epoch_lr = optimizer.param_groups[0]['lr']  # LR actually used this epoch

    model.train()
    train_losses = []
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        if torch.cuda.is_available():
            # Mixed precision: scale the loss so small gradients survive fp16.
            with torch.cuda.amp.autocast():
                preds = model(xb)
                loss = criterion(preds, yb)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
        train_losses.append(loss.item())

    # Validation
    model.eval()
    val_preds, val_trues = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            if torch.cuda.is_available():
                with torch.cuda.amp.autocast():
                    preds = model(xb)
            else:
                preds = model(xb)
            val_preds.append(preds.detach().cpu().numpy())
            val_trues.append(yb.detach().cpu().numpy())
    val_preds = np.concatenate(val_preds)
    val_trues = np.concatenate(val_trues)
    val_mae = mean_absolute_error(val_trues, val_preds)

    # Cosine decay starts only after warmup; stepping here sets the LR of the
    # NEXT epoch.
    if epoch > warmup_epochs:
        scheduler.step()

    history['train_loss'].append(np.mean(train_losses))
    history['val_mae'].append(val_mae)
    print(f"Epoch {epoch:02d}/{EPOCHS} - train_loss={np.mean(train_losses):.4f} val_mae={val_mae:.4f} lr={epoch_lr:.2e}")

    # Early stopping checkpoint: improvements smaller than 1e-4 do not count.
    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        no_improve = 0
        torch.save({'model': model.state_dict(), 'epoch': epoch}, best_ckpt)
        print("Saved best checkpoint:", best_ckpt)
    else:
        no_improve += 1
        print(f"No improvement: {no_improve}/{PATIENCE}")
        if no_improve >= PATIENCE:
            print("Early stopping triggered.")
            break

# Plot training curves
plt.figure(figsize=(6,4))
plt.plot(history['train_loss'], label='train_loss')
plt.plot(history['val_mae'], label='val_mae')
plt.legend(); plt.title('Training Curves'); plt.show()

In [None]:
# 14) Validation Metrics and Diagnostic Plots
# Load best checkpoint
if os.path.exists(best_ckpt):
    ckpt = torch.load(best_ckpt, map_location=device)
    model.load_state_dict(ckpt['model'])
    print("Loaded best model from epoch", ckpt.get('epoch'))

# Evaluate on val set
model.eval()
val_preds, val_trues = [], []
with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        if torch.cuda.is_available():
            with torch.cuda.amp.autocast():
                preds = model(xb)
        else:
            preds = model(xb)
        val_preds.append(preds.detach().cpu().numpy())
        val_trues.append(yb.detach().cpu().numpy())
val_preds = np.concatenate(val_preds)
val_trues = np.concatenate(val_trues)

mae = mean_absolute_error(val_trues, val_preds)
# Take the square root explicitly: the `squared=False` shortcut is not
# accepted by current scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(val_trues, val_preds)))
r2 = r2_score(val_trues, val_preds)
print(f"Val MAE={mae:.4f} RMSE={rmse:.4f} R2={r2:.4f}")

# Plots: predictions against ground truth with the identity line for reference
plt.figure(figsize=(5,5))
plt.scatter(val_trues, val_preds, s=10, alpha=0.7)
lims = [min(val_trues.min(), val_preds.min()), max(val_trues.max(), val_preds.max())]
plt.plot(lims, lims, 'r--')
plt.xlabel('True GSM'); plt.ylabel('Pred GSM'); plt.title('Pred vs True (Val)')
plt.show()

# Residual diagnostics: distribution, and residual as a function of the true value
residuals = val_preds - val_trues
fig, axes = plt.subplots(1,2, figsize=(10,4))
sns.histplot(residuals, kde=True, ax=axes[0])
axes[0].set_title('Residual Histogram')
axes[1].scatter(val_trues, residuals, s=10, alpha=0.7)
axes[1].axhline(0, color='r', linestyle='--')
axes[1].set_xlabel('True GSM'); axes[1].set_ylabel('Residual (pred-true)')
axes[1].set_title('Residuals vs True')
plt.show()

In [None]:
# 15) Grad-CAM and Feature Map Visualizations

def grad_cam_on_image(model, image_np_rgb, target_size=IMG_SIZE):
    """Compute a Grad-CAM heatmap of the regression output for one RGB image.

    The class activation map is built from the backbone's spatial feature map
    (before global pooling). Hooking the backbone module itself would only
    expose the pooled 2-D features, which have no H/W axes to average over.

    Args:
        model: Network exposing ``backbone`` (a timm model with
            ``forward_features`` / ``forward_head``) and ``head``.
        image_np_rgb: HxWx3 uint8 RGB image.
        target_size: Side length the image is resized to before inference.

    Returns:
        ``(cam, overlay)`` where ``cam`` is a float map in [0, 1] at the input
        image's resolution and ``overlay`` is the uint8 heatmap blend, or
        ``None`` when the backbone does not yield a 4-D feature map.
    """
    model.eval()
    # Preprocess exactly like the evaluation transform: resize + ImageNet normalisation
    img = cv2.resize(image_np_rgb, (target_size, target_size))
    img_norm = (img/255.0 - np.array(IMAGENET_MEAN)) / np.array(IMAGENET_STD)
    tensor = torch.from_numpy(img_norm.transpose(2,0,1)).float().unsqueeze(0).to(device)

    with torch.enable_grad():
        # Detach and re-enable grad so the feature map is a leaf: gradients
        # are then available even if the backbone weights are frozen.
        feats = model.backbone.forward_features(tensor).detach().requires_grad_(True)
        if feats.ndim != 4:
            return None
        pooled = model.backbone.forward_head(feats)
        pred = model.head(pooled).squeeze(1)
        # For regression, take gradient of output w.r.t. features
        grads = torch.autograd.grad(pred.sum(), feats)[0]

    weights = grads.mean(dim=(2,3), keepdim=True)  # GAP over H,W
    cam = (weights * feats).sum(dim=1).squeeze(0)
    cam = cam.detach().cpu().numpy()
    cam = np.maximum(cam, 0)
    cam = cam / (cam.max() + 1e-6)
    cam = cv2.resize(cam, (image_np_rgb.shape[1], image_np_rgb.shape[0]))
    heatmap = (plt.cm.jet(cam)[:,:,:3]*255).astype(np.uint8)
    overlay = (0.5*image_np_rgb + 0.5*heatmap).astype(np.uint8)
    return cam, overlay

# Demo Grad-CAM on few validation images
samples = val_df_for_torch.sample(min(4, len(val_df_for_torch)), random_state=0)
plt.figure(figsize=(10,8))
# Count from 1 because plt.subplot positions are 1-based; at most 4 samples fill the 2x2 grid.
for i, (_, row) in enumerate(samples.iterrows(), 1):
    img = cv2.imread(row['image_path'])
    if img is None: continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    res = grad_cam_on_image(model, img)
    # grad_cam_on_image returns None when no map could be computed; skip that panel.
    if res is None: continue
    cam, overlay = res
    plt.subplot(2,2,i)
    plt.imshow(overlay)
    plt.title(f"Grad-CAM: True GSM={row[TARGET_COL]:.2f}")
    plt.axis('off')
plt.tight_layout(); plt.show()

In [None]:
# 16) K-Fold Cross-Validation (Optional)
RUN_KFOLD = False
N_SPLITS = 5

# This is a light template; set RUN_KFOLD=True to run (can be time-consuming)
# NOTE(review): folds are drawn from the full `df` (test rows included) and
# each fold trains the default Regressor() backbone rather than BACKBONE;
# confirm both are intended.
if RUN_KFOLD:
    folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    # Use bins for stratification
    bins_all = pd.qcut(df[TARGET_COL], q=5, labels=False, duplicates='drop')
    maes = []
    for fold, (tr_idx, va_idx) in enumerate(folds.split(df, bins_all)):
        tr_df = df.iloc[tr_idx]
        va_df = df.iloc[va_idx]
        tr_ds = GSMDataset(tr_df, transform=train_aug_torch)
        va_ds = GSMDataset(va_df, transform=val_aug_torch)
        tr_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
        va_loader = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

        # Fresh model/optimizer/scheduler per fold so folds stay independent.
        fold_model = Regressor().to(device)
        opt = torch.optim.AdamW(fold_model.parameters(), lr=2e-4, weight_decay=1e-4)
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
        best = float('inf')
        for ep in range(1, 8):
            fold_model.train()
            for xb, yb in tr_loader:
                xb, yb = xb.to(device), yb.to(device)
                opt.zero_grad(set_to_none=True)
                if torch.cuda.is_available():
                    with torch.cuda.amp.autocast():
                        pr = fold_model(xb)
                        ls = criterion(pr, yb)
                    scaler.scale(ls).backward()
                    scaler.step(opt)
                    scaler.update()
                else:
                    pr = fold_model(xb)
                    ls = criterion(pr, yb)
                    ls.backward()
                    opt.step()
            # Advance the cosine schedule once per epoch; without this call
            # the scheduler has no effect on the learning rate.
            sched.step()
            # quick val
            fold_model.eval()
            vp, vt = [], []
            with torch.no_grad():
                for xb, yb in va_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pr = fold_model(xb)
                    vp.append(pr.detach().cpu().numpy())
                    vt.append(yb.detach().cpu().numpy())
            vp = np.concatenate(vp)
            vt = np.concatenate(vt)
            mae_fold = mean_absolute_error(vt, vp)
            # Track the best epoch's MAE for this fold.
            best = min(best, mae_fold)
        maes.append(best)
        print(f"Fold {fold}: best MAE {best:.4f}")
    print("K-Fold MAE mean +/- std:", np.mean(maes), np.std(maes))
else:
    print("Set RUN_KFOLD=True to enable cross-validation.")

In [None]:
# 17) Final Test Evaluation and Lightweight Unit Tests
# Simple unit checks on the first training sample: channel-first 3-channel
# tensor, finite values, float label.
a_img, a_lbl = train_ds[0]
assert a_img.shape[0] == 3 and a_img.ndim == 3, "Tensor shape mismatch"
assert torch.isfinite(a_img).all(), "Found non-finite values in image tensor"
assert isinstance(a_lbl.item(), float), "Label must be float for regression"

# Test evaluation (uses whatever weights `model` currently holds)
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        pr = model(xb)
        test_preds.append(pr.detach().cpu().numpy())
        test_trues.append(yb.detach().cpu().numpy())

test_preds = np.concatenate(test_preds)
test_trues = np.concatenate(test_trues)

t_mae = mean_absolute_error(test_trues, test_preds)
# Take the square root explicitly: the `squared=False` shortcut is not
# accepted by current scikit-learn releases.
t_rmse = float(np.sqrt(mean_squared_error(test_trues, test_preds)))
t_r2 = r2_score(test_trues, test_preds)
print(f"Test MAE={t_mae:.4f} RMSE={t_rmse:.4f} R2={t_r2:.4f}")

# Save a CSV of test predictions; test_loader is unshuffled, so predictions
# are in the same order as the rows of test_df_for_torch.
OUT_DIR = os.path.join(DATA_ROOT, 'outputs')
os.makedirs(OUT_DIR, exist_ok=True)
TEST_PRED_CSV = os.path.join(OUT_DIR, 'test_predictions.csv')
pd.DataFrame({'image_path': test_df_for_torch['image_path'].values, 'true_gsm': test_trues, 'pred_gsm': test_preds}).to_csv(TEST_PRED_CSV, index=False)
print('Wrote:', TEST_PRED_CSV)

In [None]:
# 18) Export Model, Inference Function, and Colab/Drive Integration
# Save best model locally
FINAL_DIR = os.path.join(DATA_ROOT, 'artifacts')
os.makedirs(FINAL_DIR, exist_ok=True)
FINAL_MODEL = os.path.join(FINAL_DIR, 'gsm_regressor.pt')

if os.path.exists(best_ckpt):
    # Re-saves the weights currently held by `model`; the checkpoint file is not copied.
    # NOTE(review): these equal the best checkpoint only if it was loaded into
    # `model` beforehand (Section 14 does that) — confirm before exporting.
    torch.save({'model': model.state_dict()}, FINAL_MODEL)
    print('Saved:', FINAL_MODEL)

# Optionally save to Drive when in Colab
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_OUT = "/content/drive/MyDrive/gsm_artifacts"
    os.makedirs(DRIVE_OUT, exist_ok=True)
    # NOTE(review): runs even when FINAL_MODEL was not written above.
    !cp -v "$FINAL_MODEL" "$DRIVE_OUT/"

# Inference utilities
@torch.no_grad()
def predict_image(path: str) -> float:
    """Predict one GSM value for the image file at ``path``.

    The image is read with OpenCV, converted from BGR to RGB, resized to
    ``IMG_SIZE`` x ``IMG_SIZE``, divided by 255 and normalised with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``, then passed through the global
    ``model`` as a batch of one.

    Args:
        path: Filesystem path of the image to score.

    Returns:
        The model output for that image as a Python float.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    model.eval()
    bgr = cv2.imread(path)
    if bgr is None:
        raise FileNotFoundError(path)

    rgb = cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), (IMG_SIZE, IMG_SIZE))
    mean = np.array(IMAGENET_MEAN)
    std = np.array(IMAGENET_STD)
    normalised = (rgb / 255.0 - mean) / std

    # HWC -> CHW, then prepend the batch dimension.
    chw = normalised.transpose(2, 0, 1)
    batch = torch.from_numpy(chw).float().unsqueeze(0).to(device)

    if not torch.cuda.is_available():
        output = model(batch)
    else:
        # Mixed-precision forward pass whenever CUDA is present.
        with torch.cuda.amp.autocast():
            output = model(batch)

    scalar = output.squeeze().detach().cpu()
    return float(scalar.item())

# Demo inference: score up to three test images drawn with a fixed seed.
demo_count = min(3, len(test_df_for_torch))
demo_paths = test_df_for_torch['image_path'].sample(demo_count, random_state=7)
for p in demo_paths:
    pred = predict_image(p)
    print(os.path.basename(p), '-> Pred GSM =', round(pred, 3))

# Write an (unpinned) package list next to the exported model.
REQ_TXT = os.path.join(FINAL_DIR, 'requirements.txt')
required_packages = [
    'torch',
    'torchvision',
    'timm',
    'albumentations',
    'opencv-python-headless',
    'pandas',
    'numpy',
    'scikit-learn',
    'umap-learn',
    'matplotlib',
    'seaborn',
    'tqdm',
]
requirements_text = '\n'.join(required_packages)
with open(REQ_TXT, 'w') as f:
    f.write(requirements_text)
print('Saved requirements:', REQ_TXT)

In [None]:
# Baseline: Scikit-learn on Handcrafted Features
# Fits a RandomForestRegressor on compute_features() vectors from train+val,
# reports cross-validated MAE, then scores the held-out test split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Build features for train/val jointly to train baseline
base_df = pd.concat([X_train, X_val], ignore_index=True)
Xf, yf = [], []
for _, r in tqdm(base_df.iterrows(), total=len(base_df), desc='Baseline features'):
    f = compute_features(r['image_path'])
    # Rows for which compute_features returns None are skipped, so features
    # and targets stay aligned.
    if f is not None:
        Xf.append(f)
        yf.append(float(r[TARGET_COL]))
Xf = np.vstack(Xf)
yf = np.array(yf)

rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
cv = 5
# The scorer returns negated MAE; flip the sign for reporting.
scores = cross_val_score(rf, Xf, yf, cv=cv, scoring='neg_mean_absolute_error')
print(f"Baseline RF {cv}-fold MAE: {(-scores).mean():.4f} +/- {(-scores).std():.4f}")
rf.fit(Xf, yf)

# Eval on test
Xt, yt = [], []
for _, r in tqdm(X_test.iterrows(), total=len(X_test), desc='Baseline test features'):
    f = compute_features(r['image_path'])
    if f is not None:
        Xt.append(f)
        yt.append(float(r[TARGET_COL]))
Xt = np.vstack(Xt)
yt = np.array(yt)
yp = rf.predict(Xt)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# recent scikit-learn releases no longer accept.
baseline_rmse = float(np.sqrt(mean_squared_error(yt, yp)))
print("Baseline Test MAE=%.4f RMSE=%.4f R2=%.4f" % (
    mean_absolute_error(yt, yp), baseline_rmse, r2_score(yt, yp)
))

In [None]:
# 3c) Autodetect CSV, image directory, and key columns (after download)
import glob

# Try to find a CSV under DATA_ROOT. glob returns paths in arbitrary order, so
# the candidates are sorted to make the pick reproducible across runs/machines.
csv_candidates = sorted(glob.glob(os.path.join(DATA_ROOT, "**", "*.csv"), recursive=True))
CSV_PATH = csv_candidates[0] if csv_candidates else None
print("Detected CSV:", CSV_PATH)

# Heuristic: the image directory is the folder that directly contains the most
# image files. Sorting also makes the tie-break between equally full folders
# deterministic (first folder in sorted order wins).
img_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
image_files = sorted(
    p for p in glob.glob(os.path.join(DATA_ROOT, "**", "*"), recursive=True)
    if os.path.splitext(p)[1].lower() in img_exts
)

from collections import Counter
parent_counts = Counter(os.path.dirname(p) for p in image_files)
IMAGE_DIR = None
if parent_counts:
    IMAGE_DIR = parent_counts.most_common(1)[0][0]
print("Detected IMAGE_DIR:", IMAGE_DIR)

# Detect likely image and target columns; these defaults are kept whenever the
# CSV offers no better match.
IMAGE_COL = None
TARGET_COL = 'gsm'
ALT_MASS_COL = 'mass'
AREA_COL = 'area_m2'

if CSV_PATH and os.path.exists(CSV_PATH):
    tmp = pd.read_csv(CSV_PATH)
    # Not read in this cell; kept in case a downstream cell uses it.
    cols = [c.lower() for c in tmp.columns]
    # Image column heuristic: keywords are tried in priority order and the
    # first column containing the first matching keyword is taken.
    for key in ["image", "file", "filename", "path"]:
        matches = [c for c in tmp.columns if key in c.lower()]
        if matches:
            IMAGE_COL = matches[0]
            break
    # Target column heuristic
    gsm_like = [
        c for c in tmp.columns
        if "gsm" in c.lower() or "grammage" in c.lower() or "basis" in c.lower()
    ]
    if gsm_like:
        TARGET_COL = gsm_like[0]
    else:
        # mass-like fallback
        mass_like = [c for c in tmp.columns if any(k in c.lower() for k in ["mass", "weight", "wt"])]
        if mass_like:
            ALT_MASS_COL = mass_like[0]
        # area-like fallback
        area_like = [c for c in tmp.columns if "area" in c.lower()]
        if area_like:
            AREA_COL = area_like[0]

print("IMAGE_COL:", IMAGE_COL, "TARGET_COL:", TARGET_COL, "ALT_MASS_COL:", ALT_MASS_COL, "AREA_COL:", AREA_COL)

# Persist detected settings for downstream cells
DETECTED_SETTINGS = {
    'CSV_PATH': CSV_PATH,
    'IMAGE_DIR': IMAGE_DIR,
    'IMAGE_COL': IMAGE_COL,
    'TARGET_COL': TARGET_COL,
    'ALT_MASS_COL': ALT_MASS_COL,
    'AREA_COL': AREA_COL,
}
print("Settings:", DETECTED_SETTINGS)

In [None]:
# 10d) Deep Embeddings (timm backbone) + UMAP
# Extract pooled embeddings from a pretrained backbone and visualize separability
import umap

@torch.no_grad()
def extract_backbone_embeddings(paths, backbone_name='efficientnet_b0', return_indices=False):
    """Embed each readable image in ``paths`` with a pretrained timm backbone.

    Every image is read with OpenCV, converted BGR -> RGB, resized to
    ``IMG_SIZE`` x ``IMG_SIZE``, divided by 255, normalised with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD`` and passed through the backbone one
    at a time. Paths that ``cv2.imread`` cannot read are skipped.

    Args:
        paths: Sequence of image file paths.
        backbone_name: timm model name; created with ``num_classes=0`` and
            average global pooling.
        return_indices: When True, also return the positions in ``paths`` of
            the images that were embedded, so callers can keep labels aligned
            with the embedding rows even when some files were skipped.

    Returns:
        A 2-D array with one row per embedded image, or ``None`` if no image
        could be read. With ``return_indices=True`` a tuple
        ``(embeddings_or_None, kept_indices)`` is returned instead.
    """
    model_fe = timm.create_model(backbone_name, pretrained=True, num_classes=0, global_pool='avg').to(device).eval()
    mean = np.array(IMAGENET_MEAN)
    std = np.array(IMAGENET_STD)
    embs, kept = [], []
    for i, p in enumerate(tqdm(paths, desc='Embeddings')):
        img = cv2.imread(p)
        if img is None:
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = (img / 255.0 - mean) / std
        # HWC -> CHW plus a leading batch dimension.
        ten = torch.from_numpy(img.transpose(2, 0, 1)).float().unsqueeze(0).to(device)
        if torch.cuda.is_available():
            with torch.cuda.amp.autocast():
                f = model_fe(ten)
        else:
            f = model_fe(ten)
        embs.append(f.detach().cpu().numpy())
        kept.append(i)
    stacked = np.vstack(embs) if embs else None
    if return_indices:
        return stacked, kept
    return stacked

subset_deep = df.sample(min(400, len(df)), random_state=7).reset_index(drop=True)
X_deep, kept_idx = extract_backbone_embeddings(subset_deep['image_path'].tolist(), return_indices=True)
if X_deep is not None:
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
    embd = reducer.fit_transform(X_deep)
    # Colour each point by the target of the image it was computed from;
    # truncating the label column would misalign labels once any image in the
    # middle of the subset had been skipped as unreadable.
    deep_targets = subset_deep[TARGET_COL].values[kept_idx]
    plt.figure(figsize=(6,5))
    sc = plt.scatter(embd[:,0], embd[:,1], c=deep_targets, cmap='viridis', s=10)
    plt.title('UMAP of Deep Embeddings (pretrained backbone)')
    plt.colorbar(sc, label='GSM')
    plt.show()
else:
    print('No embeddings extracted (no readable images).')

In [None]:
# 10e) Inspect Feature Types, Names, and Example Values
# This cell prints the names for each feature we extract and shows sample values.

def get_simple_feature_names(img_size=IMG_SIZE, hist_bins=32):
    """Return the ordered labels for the simple feature vector.

    Layout: mean/std/skew for each of channels 0-2, then ``hist_bins``
    histogram-bin labels per channel, then ``laplacian_var`` and
    ``sobel_energy``. ``img_size`` is accepted but not used here.

    NOTE(review): presumably mirrors the layout produced by compute_features;
    confirm against that function.
    """
    channels = range(3)
    moment_names = [
        f"c{c}_{stat}" for c in channels for stat in ("mean", "std", "skew")
    ]
    hist_names = [
        f"c{c}_hist_bin{b}" for c in channels for b in range(hist_bins)
    ]
    return moment_names + hist_names + ["laplacian_var", "sobel_energy"]

def get_glcm_lbp_feature_names(distances=(1,2,4), angles=(0, np.pi/4, np.pi/2, 3*np.pi/4), lbp_points=8):
    """Return the ordered labels for the GLCM + LBP texture feature vector.

    GLCM labels are generated property-major, then distance, then angle, as
    ``glcm_{prop}_d{distance}_a{angle_in_degrees}``; they are followed by
    ``lbp_points + 2`` labels ``lbp_bin{k}``.

    Args:
        distances: GLCM pixel distances.
        angles: GLCM angles in radians.
        lbp_points: Number of LBP sampling points ``P``; the histogram is
            labelled with ``P + 2`` bins. The default of 8 is the value the
            original comment attributes to compute_features_glcm_lbp.

    Returns:
        List of feature-name strings.
    """
    props = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    # Round rather than truncate: a radian value that converts to just under a
    # whole degree (e.g. 29.999...) must still be labelled with that degree.
    angle_degs = [int(round(float(np.rad2deg(a)))) for a in angles]
    names = [
        f"glcm_{p}_d{d}_a{adeg}"
        for p in props
        for d in distances
        for adeg in angle_degs
    ]
    names.extend(f"lbp_bin{b}" for b in range(lbp_points + 2))
    return names

# Print simple feature info
simple_names = get_simple_feature_names()
print(f"Simple features count: {len(simple_names)}")
print("Simple features (first 20):", simple_names[:20], "...")

# Show example values for one image
if len(df) > 0:
    ex_path = df.iloc[0]['image_path']
    fv = compute_features(ex_path)
    if fv is not None:
        print(f"Example simple feature vector length: {len(fv)}")
        # zip() silently truncates, so surface a name/vector length mismatch.
        if len(fv) != len(simple_names):
            print(f"Warning: {len(simple_names)} feature names but {len(fv)} feature values.")
        for n, v in list(zip(simple_names, fv))[:20]:
            print(f"  {n}: {float(v):.4f}")

# Print texture feature info if available
if 'compute_features_glcm_lbp' in globals() and HAVE_SKIMAGE:
    glcm_lbp_names = get_glcm_lbp_feature_names()
    print(f"GLCM+LBP features count: {len(glcm_lbp_names)}")
    print("GLCM+LBP features (first 20):", glcm_lbp_names[:20], "...")
    if len(df) > 0:
        ex_path = df.iloc[0]['image_path']
        fv2 = compute_features_glcm_lbp(ex_path)
        if fv2 is not None:
            print(f"Example GLCM+LBP feature vector length: {len(fv2)}")
            if len(fv2) != len(glcm_lbp_names):
                print(f"Warning: {len(glcm_lbp_names)} feature names but {len(fv2)} feature values.")
            for n, v in list(zip(glcm_lbp_names, fv2))[:20]:
                print(f"  {n}: {float(v):.4f}")
else:
    print("GLCM/LBP not available; install scikit-image to enable.")

# Deep embedding dimension (from the backbone architecture)
try:
    # Only the architecture's num_features is read, so no pretrained weights
    # need to be downloaded or loaded here.
    tmp_backbone = timm.create_model('efficientnet_b0', pretrained=False, num_classes=0, global_pool='avg')
    print("Deep embedding (efficientnet_b0) dim:", tmp_backbone.num_features)
except Exception as e:
    # Best-effort inspection: report the problem and carry on with the cell.
    print("Could not inspect deep embedding dim:", e)

# Optional: quick feature importance on simple features
RUN_FEATURE_IMPORTANCE = False
if RUN_FEATURE_IMPORTANCE:
    Xf, yf = [], []
    imp_n = min(300, len(df))
    for _, r in tqdm(df.sample(imp_n, random_state=0).iterrows(), total=imp_n, desc='Imp feats'):
        f = compute_features(r['image_path'])
        if f is not None:
            Xf.append(f)
            yf.append(float(r[TARGET_COL]))
    if Xf:
        from sklearn.ensemble import RandomForestRegressor
        Xf = np.vstack(Xf)
        yf = np.array(yf)
        rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
        rf.fit(Xf, yf)
        importances = rf.feature_importances_
        # Indices of the 20 largest importances, most important first.
        idx = np.argsort(importances)[::-1][:20]
        # Fall back to a positional label when the name list is shorter than
        # the feature vector, instead of raising IndexError.
        top_labels = [
            simple_names[i] if i < len(simple_names) else f"feature_{i}"
            for i in idx
        ]
        plt.figure(figsize=(8,5))
        # Reverse so the most important feature is drawn at the top of the chart.
        plt.barh(top_labels[::-1], importances[idx][::-1])
        plt.title('Top-20 Feature Importances (Simple Features)')
        plt.tight_layout()
        plt.show()