#  DATA PREPROCESSING

#  Setup Libraries


In [None]:
# ============================================
# Libraries & Packages (Imports in One Cell)
# ============================================

# --- System & Utilities
import os, sys, glob, time, random, shutil, platform, warnings, json, csv, pickle
from pathlib import Path, PureWindowsPath
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
warnings.filterwarnings("ignore")

# --- Numerical & Data Handling
import numpy as np
import pandas as pd

# --- Visualization
import matplotlib.pyplot as plt
from matplotlib.image import imread
from IPython.display import display

# --- Image Processing
import cv2
from PIL import Image

# --- Deep Learning (TensorFlow / Keras)
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, 
                                     GlobalAveragePooling2D, Dense, Dropout)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import (ModelCheckpoint, EarlyStopping, 
                                        ReduceLROnPlateau)
from tensorflow.keras.metrics import CategoricalAccuracy

# --- Machine Learning (Scikit-learn)
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_curve, auc, f1_score, accuracy_score)

# ============================================
print("[INFO] All libraries successfully imported.")


# GPU Setup (TensorFlow)
We enable *memory growth* on all visible GPUs. This avoids TensorFlow pre-allocating all VRAM and prevents OOM issues when other processes (or notebooks) share the GPU.


In [2]:
# ========== 1) GPU: safe memory growth ==========
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"[GPU] Using {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(e)
else:
    print("[GPU] No GPU detected; running on CPU")

[GPU] No GPU detected; running on CPU


# Dataset Root & Split Discovery
This cell sets the dataset root and robustly resolves paths (Windows/WSL/OneDrive).
It then detects the `train/`, `val/`, and `test/` folders and verifies that all
expected OCT classes (`CNV`, `DME`, `DRUSEN`, `NORMAL`) are present.

If the data lives directly under the root as class folders (no `train/val/test`),
the code falls back to treating the root as the training set.


In [None]:
# ========== 2) Data paths (auto-detect common names) ==========
data_dir = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN"  # <--- change if needed

# Expected classes (define early because later logic checks these)
categories = ["CNV", "DME", "DRUSEN", "NORMAL"]
expected = set(categories)

# If the chosen data_dir doesn't exist (WSL vs Windows path mismatch), try fallbacks:
if not os.path.isdir(data_dir):
    print(f"[WARN] data_dir not found: {data_dir}")

    # 1) Try known notebook variable INPUT_ROOT (Windows style) if available
    try:
        if 'INPUT_ROOT' in globals() and INPUT_ROOT and os.path.isdir(str(INPUT_ROOT)):
            data_dir = str(INPUT_ROOT)
            print(f"[INFO] Falling back to INPUT_ROOT: {data_dir}")
    except Exception:
        pass

    # 2) Try converting /mnt/<drive>/... -> C:\... (common WSL<->Windows mismatch)
    if not os.path.isdir(data_dir):
        try:
            p = Path(data_dir)
            parts = p.parts
            # expected form: ('/', 'mnt', 'c', 'Users', ...)
            if len(parts) > 2 and parts[0] == '/' and parts[1] == 'mnt':
                drive = parts[2]
                if len(drive) == 1:
                    drive_letter = f"{drive.upper()}:"
                    win_path = Path(drive_letter, *parts[3:])
                    if os.path.isdir(str(win_path)):
                        data_dir = str(win_path)
                        print(f"[INFO] Found Windows-equivalent path: {data_dir}")
        except Exception:
            pass

    # 3) Try a more permissive replacement of the prefix (covers some variants)
    if not os.path.isdir(data_dir):
        try:
            s = data_dir
            if s.startswith('/mnt/'):
                # replace '/mnt/c/...' -> 'C:/...'
                parts = s.split('/')
                if len(parts) > 2 and len(parts[2]) == 1:
                    drive_letter = parts[2].upper() + ':'
                    rest = "/".join(parts[3:])
                    alt = os.path.join(drive_letter, rest.replace('/', os.sep))
                    if os.path.isdir(alt):
                        data_dir = alt
                        print(f"[INFO] Found alternative Windows path: {data_dir}")
        except Exception:
            pass

if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Could not find dataset root. Checked data_dir and fallbacks. "
        f"Original data_dir: {data_dir}. "
        f"Please set data_dir to the correct dataset root or adjust the path."
    )

def _pick_existing(root, candidates):
    for name in candidates:
        p = os.path.join(root, name)
        if os.path.isdir(p):
            return p, name
    return None, None

train_path, train_name = _pick_existing(data_dir, ["train","training","Train","TRAIN"])
val_path,   val_name   = _pick_existing(data_dir, ["val","validation","valid","Val","Validation"])
test_path,  test_name  = _pick_existing(data_dir, ["test","testing","Test","Testing"])

# If train/test directories are not present directly under data_dir, it's possible
# the dataset layout is class folders directly under data_dir (no split folders).
# In that case treat data_dir as 'train' and leave test unset so later logic can use KFold.
if not train_path and any(os.path.isdir(os.path.join(data_dir, d)) for d in os.listdir(data_dir)):
    # check if data_dir itself contains class subfolders matching expected classes
    subdirs = {d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))}
    if expected.issubset(subdirs):
        train_path, train_name = data_dir, os.path.basename(data_dir)
        print(f"[INFO] Using {data_dir} as training root (found class folders directly).")

if not train_path or not test_path:
    raise FileNotFoundError(
        f"Could not find train/test under {data_dir}. "
        f"Tried train/training and test/testing. If your dataset uses a single split (all images under class folders), set data_dir accordingly."
    )

print(f"[DATA] train: {train_name} -> {train_path}")
print(f"[DATA]  val : {val_name or 'NONE (KFold)'} -> {val_path}")
print(f"[DATA] test : {test_name} -> {test_path}")

def _check_classes(path):
    sub = {d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))}
    missing = expected - sub
    if missing:
        print(f"[WARN] {path} missing classes: {sorted(missing)}")
    else:
        print(f"[OK] Classes at {path}: {sorted(sub)}")
_check_classes(train_path)
_check_classes(test_path)


[DATA] train: train -> C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN\train
[DATA]  val : val -> C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN\val
[DATA] test : test -> C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN\test
[OK] Classes at C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN\train: ['CNV', 'DME', 'DRUSEN', 'NORMAL']
[OK] Classes at C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN\test: ['CNV', 'DME', 'DRUSEN', 'NORMAL']


# Preprocessing Step 1: Resize to 224×224 (Aspect-Ratio Preserved)
We resize all images to **224×224** using *letterboxing* (black padding). This
standardizes input sizes without cropping retinal tissue. The output mirrors the
original folder structure and is written to `OCT2017_Resize`.


In [None]:
import cv2
from pathlib import Path

# -------- CONFIG --------
INPUT_ROOT  = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN"
OUTPUT_ROOT = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_Resize"
TARGET_SIZE = (224, 224)
EXTS = {".png", ".jpg", ".jpeg"}   # accepted formats
# ------------------------

def resize_with_padding(img, target_size=(224, 224)):
    h, w = img.shape[:2]
    tw, th = target_size
    scale = min(tw / w, th / h)
    nw, nh = int(w * scale), int(h * scale)
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_AREA)
    dw, dh = tw - nw, th - nh
    top, bottom = dh // 2, dh - dh // 2
    left, right = dw // 2, dw - dw // 2
    return cv2.copyMakeBorder(resized, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=[0, 0, 0])

def process_dataset(in_root, out_root):
    in_root, out_root = Path(in_root), Path(out_root)
    all_imgs = [p for p in in_root.rglob("*") if p.suffix.lower() in EXTS]

    if not in_root.exists():
        print("❌ INPUT_ROOT not found:", in_root)
        return

    print(f"Found {len(all_imgs)} images under {in_root}")
    for i, src in enumerate(all_imgs, 1):
        rel = src.relative_to(in_root)
        dst = out_root / rel
        dst.parent.mkdir(parents=True, exist_ok=True)

        img = cv2.imread(str(src))
        if img is None:
            print("⚠️ Skipping unreadable file:", src)
            continue
        out = resize_with_padding(img, TARGET_SIZE)
        cv2.imwrite(str(dst), out)

        if i % 100 == 0:
            print(f"Processed {i}/{len(all_imgs)}")

    print("✅ All images saved to:", out_root)

process_dataset(INPUT_ROOT, OUTPUT_ROOT)


Found 84484 images under C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\research\dataset\OCT2017_CLEAN
Processed 100/84484
Processed 200/84484
Processed 300/84484
Processed 400/84484
Processed 500/84484
Processed 600/84484
Processed 700/84484
Processed 800/84484
Processed 900/84484
Processed 1000/84484
Processed 1100/84484
Processed 1200/84484
Processed 1300/84484
Processed 1400/84484
Processed 1500/84484
Processed 1600/84484
Processed 1700/84484
Processed 1800/84484
Processed 1900/84484
Processed 2000/84484
Processed 2100/84484
Processed 2200/84484
Processed 2300/84484
Processed 2400/84484
Processed 2500/84484
Processed 2600/84484
Processed 2700/84484
Processed 2800/84484
Processed 2900/84484
Processed 3000/84484
Processed 3100/84484
Processed 3200/84484
Processed 3300/84484
Processed 3400/84484
Processed 3500/84484
Processed 3600/84484
Processed 3700/84484
Processed 3800/84484
Processed 3900/84484
Processed 4000/84484
Processed 4100/84484
Processed 4200/84484
Processed 4300

# Preprocessing Step 2: White-Border Removal
Some pipelines introduce white borders during previous processing or export.
These high-intensity edges can bias the model and distort normalization.

We detect **white regions that touch the image border** via connected components
and set them to **black**. Outputs are saved to `OCT2017_border`. A summary reports
how many files were modified per split/class.


In [None]:
import cv2
import numpy as np
from pathlib import Path
from time import time
from collections import defaultdict

# ---------- CONFIG ----------
INPUT_ROOT  = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_Resize"
OUTPUT_ROOT = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_border"
EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
OVERWRITE = False
WHITE_THRESH = 250  # pixels >= this are considered 'white' for border detection
# ---------------------------

def fill_white_border_black(img, white_thresh=250):
    """
    Replace white regions that touch the image border with black.
    Preserves grayscale or color (BGR) shape.
    Returns: (modified_img, changed_pixels_count)
    """
    h, w = img.shape[:2]

    # Build 'white' mask (grayscale or color)
    if img.ndim == 3 and img.shape[2] == 3:
        white_mask = np.all(img >= white_thresh, axis=2)
    else:
        white_mask = (img >= white_thresh)

    if not np.any(white_mask):
        return img, 0

    white_u8 = (white_mask.astype(np.uint8) * 255)
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(white_u8, connectivity=8)

    # find components that touch any border
    border_labels = []
    for lbl in range(1, num_labels):  # skip background
        left = stats[lbl, cv2.CC_STAT_LEFT]
        top = stats[lbl, cv2.CC_STAT_TOP]
        width = stats[lbl, cv2.CC_STAT_WIDTH]
        height = stats[lbl, cv2.CC_STAT_HEIGHT]
        if left == 0 or top == 0 or (left + width) == w or (top + height) == h:
            border_labels.append(lbl)

    if len(border_labels) == 0:
        # fallback: keep only whites that lie exactly on the outermost edges
        full_border_mask = np.zeros_like(white_mask, dtype=bool)
        full_border_mask[0, :]  = white_mask[0, :]
        full_border_mask[-1, :] = white_mask[-1, :]
        full_border_mask[:, 0]  = full_border_mask[:, 0]  | white_mask[:, 0]
        full_border_mask[:, -1] = full_border_mask[:, -1] | white_mask[:, -1]
    else:
        # include all pixels belonging to border-touching white components
        full_border_mask = np.isin(labels, border_labels)

    target_mask = white_mask & full_border_mask
    changed = int(np.count_nonzero(target_mask))
    if changed == 0:
        return img, 0

    out = img.copy()
    if out.ndim == 3:
        out[target_mask, :] = 0
    else:
        out[target_mask] = 0
    return out, changed

def iter_images(root, exts):
    root = Path(root)
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            yield p

def main():
    in_root  = Path(INPUT_ROOT)
    out_root = Path(OUTPUT_ROOT)
    out_root.mkdir(parents=True, exist_ok=True)

    if not in_root.exists():
        print("❌ INPUT_ROOT does not exist:", in_root)
        print("Tip: If this is a OneDrive folder, right-click it in Explorer and choose 'Always keep on this device'.")
        return

    files = list(iter_images(in_root, EXTS))
    total = len(files)
    if total == 0:
        print("❌ No images found under:", in_root)
        return

    print(f"Found {total} images. Writing to: {out_root}")
    t0 = time()
    processed = skipped = errors = 0
    changed_pixels_total = 0

    # Optional per (split,class) counts just for reporting
    buckets = defaultdict(lambda: {"count":0, "changed":0})

    def progress(i, n, bar_len=30):
        frac = (i + 1) / n
        bar = "█" * int(bar_len * frac) + "·" * (bar_len - int(bar_len * frac))
        print(f"\r[{bar}] {i+1}/{n}", end="", flush=True)

    for i, src in enumerate(files):
        rel = src.relative_to(in_root)
        dst = out_root / rel
        dst.parent.mkdir(parents=True, exist_ok=True)

        # bucket label for quick sanity: e.g., train/CNV or val/DME
        parts = rel.parts
        if len(parts) >= 2 and parts[0].lower() in {"train","val","test"}:
            bucket = f"{parts[0]}/{parts[1]}"
        elif len(parts) >= 1:
            bucket = parts[0]
        else:
            bucket = "root"

        if dst.exists() and not OVERWRITE:
            skipped += 1
            buckets[bucket]["count"] += 1
            progress(i, total)
            continue

        img = cv2.imread(str(src), cv2.IMREAD_UNCHANGED)
        if img is None:
            errors += 1
            progress(i, total)
            continue

        try:
            out, changed = fill_white_border_black(img, white_thresh=WHITE_THRESH)
            cv2.imwrite(str(dst), out)
            processed += 1
            changed_pixels_total += changed
            buckets[bucket]["count"] += 1
            buckets[bucket]["changed"] += (1 if changed > 0 else 0)
        except Exception:
            errors += 1
        finally:
            progress(i, total)

    dt = time() - t0
    print("\n\n✅ Done.")
    print(f"Processed images : {processed}")
    print(f"Skipped (exists) : {skipped}")
    print(f"Errors           : {errors}")
    print(f"Pixels changed   : {changed_pixels_total}")
    print(f"Elapsed          : {dt:.1f}s")

    # Small summary per bucket
    print("\nSummary per split/class (files seen, files modified):")
    for k in sorted(buckets.keys()):
        print(f"  {k:<20}  {buckets[k]['count']:>6}  modified:{buckets[k]['changed']:>6}")

if __name__ == "__main__":
    main()


Found 84484 images. Writing to: C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_border
[██████████████████████████████] 84484/84484

✅ Done.
Processed images : 84484
Skipped (exists) : 0
Errors           : 0
Pixels changed   : 160997002
Elapsed          : 2064.0s

Summary per split/class (files seen, files modified):
  test/CNV                 242  modified:   181
  test/DME                 242  modified:   147
  test/DRUSEN              242  modified:   204
  test/NORMAL              242  modified:   221
  train/CNV              37205  modified: 21571
  train/DME              11348  modified:  7033
  train/DRUSEN            8616  modified:  5666
  train/NORMAL           26315  modified: 16548
  val/CNV                    8  modified:     5
  val/DME                    8  modified:     5
  val/DRUSEN                 8  modified:     6
  val/NORMAL                 8  modified:     7


# Split Step 1: Stratified Train/Val (from Border-Cleaned Set)
We construct a new validation set (default **15%**) *per class* from the border-cleaned dataset.
Optionally, the old `val/` is merged back into the pool to increase sample variety.

- Input: `OCT2017_border`
- Output: `OCT2017_STRATIFIED_BORDER`
- Test is **copied unchanged**.


In [None]:
import os
import shutil
import random
from pathlib import Path
from collections import defaultdict

# ================== CONFIG ==================
# Use your border-cleaned dataset as input
INPUT_ROOT   = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_border"

# Output root where new train/ val/ test/ will be created
OUTPUT_ROOT  = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_STRATIFIED_BORDER"

VAL_RATIO    = 0.15     # 15% of (train + old val, if included) -> validation
INCLUDE_OLD_VAL_IN_POOL = True  # True = merge INPUT_ROOT/val back into pool before splitting
COPY_TEST    = True     # copy INPUT_ROOT/test to OUTPUT_ROOT/test as-is

RANDOM_SEED  = 42
CLEAN_OUTPUT = False    # CAUTION: if True, deletes OUTPUT_ROOT before writing
EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
# ============================================

random.seed(RANDOM_SEED)

in_root   = Path(INPUT_ROOT)
out_root  = Path(OUTPUT_ROOT)
train_in  = in_root / "train"
val_in    = in_root / "val"
test_in   = in_root / "test"

train_out = out_root / "train"
val_out   = out_root / "val"
test_out  = out_root / "test"

def list_images(folder: Path):
    return sorted([p for p in folder.rglob("*") if p.is_file() and p.suffix.lower() in EXTS])

def copy_files(files, dst_dir: Path):
    dst_dir.mkdir(parents=True, exist_ok=True)
    for f in files:
        shutil.copy2(f, dst_dir / f.name)

def copy_tree(src: Path, dst: Path):
    if not src.exists():
        print(f"⚠️ Source not found, skipping copy: {src}")
        return
    for cls_dir in [d for d in src.iterdir() if d.is_dir()]:
        cls_out = dst / cls_dir.name
        cls_out.mkdir(parents=True, exist_ok=True)
        for f in list_images(cls_dir):
            shutil.copy2(f, cls_out / f.name)

def collect_by_class(root: Path):
    """Return dict: class_name -> [Path, ...] from immediate subfolders."""
    if not root.exists():
        return {}
    classes = [d for d in root.iterdir() if d.is_dir()]
    by_class = {}
    for c in classes:
        imgs = list_images(c)
        if imgs:
            by_class[c.name] = imgs
    return by_class

# ---------- Safety & setup ----------
if not in_root.exists():
    raise FileNotFoundError(f"INPUT_ROOT not found: {in_root}\n"
                            f"Tip: If OneDrive, right-click folder in Explorer -> 'Always keep on this device'.")

if out_root.exists() and CLEAN_OUTPUT:
    print(f"⚠️ Removing existing output: {out_root}")
    shutil.rmtree(out_root)

train_out.mkdir(parents=True, exist_ok=True)
val_out.mkdir(parents=True, exist_ok=True)
if COPY_TEST:
    test_out.mkdir(parents=True, exist_ok=True)

# ---------- Gather pool for stratified split ----------
pool_by_class = defaultdict(list)

# add TRAIN
train_by_class = collect_by_class(train_in)
if not train_by_class:
    raise RuntimeError(f"No class subfolders or images found under {train_in}")

for cls, files in train_by_class.items():
    pool_by_class[cls].extend(files)

# optionally add old VAL into pool
if INCLUDE_OLD_VAL_IN_POOL and val_in.exists():
    val_by_class = collect_by_class(val_in)
    for cls, files in val_by_class.items():
        pool_by_class[cls].extend(files)

# ---------- Stratified split ----------
summary = defaultdict(lambda: {"pool":0, "train":0, "val":0})

for cls, files in pool_by_class.items():
    if len(files) == 0:
        print(f"⚠️ No images for class '{cls}', skipping.")
        continue

    random.shuffle(files)
    n_total = len(files)
    n_val = max(1, int(round(n_total * VAL_RATIO)))

    val_files   = files[:n_val]
    train_files = files[n_val:]

    # write out
    cls_train_out = train_out / cls
    cls_val_out   = val_out / cls
    cls_train_out.mkdir(parents=True, exist_ok=True)
    cls_val_out.mkdir(parents=True, exist_ok=True)

    for src in train_files:
        shutil.copy2(src, cls_train_out / src.name)
    for src in val_files:
        shutil.copy2(src, cls_val_out / src.name)

    # summary
    summary[cls]["pool"]  = n_total
    summary[cls]["train"] = len(train_files)
    summary[cls]["val"]   = len(val_files)

# ---------- Copy test split unchanged ----------
if COPY_TEST and test_in.exists():
    for cls_dir in [d for d in test_in.iterdir() if d.is_dir()]:
        copy_files(list_images(cls_dir), test_out / cls_dir.name)

# ---------- Report ----------
grand_pool  = sum(v["pool"]  for v in summary.values())
grand_train = sum(v["train"] for v in summary.values())
grand_val   = sum(v["val"]   for v in summary.values())

print("\n✅ Stratified split complete (using BORDER dataset).")
print(f"Output root: {out_root}")
print(f" New train/: {grand_train} images")
print(f" New val/  : {grand_val} images")
if COPY_TEST and test_in.exists():
    test_count = len(list_images(test_out))
    print(f" test/     : {test_count} images (copied from input/test)")
print("\nPer-class breakdown (pool -> train / val):")
for cls in sorted(summary.keys()):
    s = summary[cls]
    r = (s["val"] / max(1, s["pool"])) * 100.0
    print(f"  {cls:<10} pool={s['pool']:>6}  train={s['train']:>6}  val={s['val']:>6}  (val {r:4.1f}%)")



✅ Stratified split complete (using BORDER dataset).
Output root: C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_STRATIFIED_BORDER
 New train/: 70989 images
 New val/  : 12527 images
 test/     : 968 images (copied from input/test)

Per-class breakdown (pool -> train / val):
  CNV        pool= 37213  train= 31631  val=  5582  (val 15.0%)
  DME        pool= 11356  train=  9653  val=  1703  (val 15.0%)
  DRUSEN     pool=  8624  train=  7330  val=  1294  (val 15.0%)
  NORMAL     pool= 26323  train= 22375  val=  3948  (val 15.0%)


# Split Step 2: Global 70/15/15 (Train/Val/Test)
For a clean experimental baseline, we pool **train/val/test** per class and
re-split into **70% Train / 15% Val / 15% Test**. This produces balanced split
sizes at the dataset level while preserving class stratification.

- Input: `OCT2017_STRATIFIED_BORDER`
- Output: `OCT2017_70_15_15`
- Duplicate filenames are auto-resolved by appending a suffix.


In [None]:
import os
import shutil
import random
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

# =============== CONFIG ===============
INPUT_ROOT  = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_STRATIFIED_BORDER"
OUTPUT_ROOT = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_70_15_15"

# Target split ratios (must sum to 1.0)
TRAIN_RATIO = 0.70
VAL_RATIO   = 0.15
TEST_RATIO  = 0.15

RANDOM_SEED = 42
EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}  # add if needed
CLEAN_OUTPUT = False   # CAUTION: if True, deletes OUTPUT_ROOT first
MAX_WORKERS = 8        # threads for faster copying (I/O bound)
# ======================================

assert abs((TRAIN_RATIO + VAL_RATIO + TEST_RATIO) - 1.0) < 1e-6, "Ratios must sum to 1.0"
random.seed(RANDOM_SEED)

in_root = Path(INPUT_ROOT)
out_root = Path(OUTPUT_ROOT)
train_out = out_root / "train"
val_out   = out_root / "val"
test_out  = out_root / "test"

splits_in = [in_root / "train", in_root / "val", in_root / "test"]

def list_images(root: Path):
    return [p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in EXTS]

def collect_pool_by_class(input_root: Path):
    """
    Collect ALL images from input_root/{train,val,test}/<class>/...
    Returns dict: class_name -> [Path, ...]
    """
    pool = defaultdict(list)
    for split_dir in splits_in:
        if not split_dir.exists(): 
            continue
        for cls_dir in [d for d in split_dir.iterdir() if d.is_dir()]:
            imgs = list_images(cls_dir)
            if imgs:
                pool[cls_dir.name].extend(imgs)
    return pool

def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def safe_copy(src: Path, dst: Path):
    """
    Copy src -> dst. If a file with same name exists, append numeric suffix to avoid overwrite.
    """
    ensure_dir(dst.parent)
    target = dst
    if target.exists():
        stem, suf = target.stem, target.suffix
        k = 1
        while target.exists():
            target = target.with_name(f"{stem}_{k}{suf}")
            k += 1
    shutil.copy2(src, target)

def copy_many(pairs):
    """
    Copy (src, dst) pairs using threads for I/O speed.
    """
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futs = [ex.submit(safe_copy, s, d) for s, d in pairs]
        for i, _ in enumerate(as_completed(futs), 1):
            if i % 200 == 0:
                print(f"  copied {i}/{len(pairs)} files…")

# --------- Main procedure ----------
if not in_root.exists():
    raise FileNotFoundError(
        f"INPUT_ROOT not found: {in_root}\n"
        f"If this is OneDrive, right‑click the folder in Explorer → 'Always keep on this device'."
    )

if out_root.exists() and CLEAN_OUTPUT:
    print(f"⚠️ Removing existing output: {out_root}")
    shutil.rmtree(out_root)

# Create output dirs
ensure_dir(train_out); ensure_dir(val_out); ensure_dir(test_out)

# Build pool
pool = collect_pool_by_class(in_root)
if not pool:
    raise RuntimeError(f"No class folders or images found under {in_root}/(train|val|test).")

# Split & prepare copy lists
summary = {}
copy_list = []  # list of (src, dst) to copy

print("Preparing stratified 70/15/15 split per class...")
for cls, files in pool.items():
    if not files:
        print(f"⚠️ No images for class '{cls}', skipping.")
        continue

    random.shuffle(files)
    n_total = len(files)
    n_train = int(round(n_total * TRAIN_RATIO))
    n_val   = int(round(n_total * VAL_RATIO))
    # ensure totals add up exactly
    n_test  = n_total - n_train - n_val

    train_files = files[:n_train]
    val_files   = files[n_train:n_train+n_val]
    test_files  = files[n_train+n_val:]

    # Save intended counts
    summary[cls] = {"total": n_total, "train": len(train_files), "val": len(val_files), "test": len(test_files)}

    # Build destination pairs
    for src in train_files:
        dst = train_out / cls / src.name
        copy_list.append((src, dst))
    for src in val_files:
        dst = val_out / cls / src.name
        copy_list.append((src, dst))
    for src in test_files:
        dst = test_out / cls / src.name
        copy_list.append((src, dst))

# Execute copies
total_to_copy = len(copy_list)
print(f"Copying {total_to_copy} files to:\n  {out_root}\nThis may take a while...")
copy_many(copy_list)

# Report
grand_total = sum(v["total"] for v in summary.values())
grand_train = sum(v["train"] for v in summary.values())
grand_val   = sum(v["val"]   for v in summary.values())
grand_test  = sum(v["test"]  for v in summary.values())

print("\n✅ New dataset created with stratified 70/15/15 split.")
print(f"Output root: {out_root}")
print(f" train/: {grand_train}")
print(f" val/  : {grand_val}")
print(f" test/ : {grand_test}")
print(f" total : {grand_total}\n")

print("Per-class breakdown:")
for cls in sorted(summary.keys()):
    s = summary[cls]
    print(f"  {cls:<10} total={s['total']:>6}  train={s['train']:>6}  val={s['val']:>6}  test={s['test']:>6}")


Preparing stratified 70/15/15 split per class...
Copying 84465 files to:
  C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_70_15_15
This may take a while...
  copied 200/84465 files…
  copied 400/84465 files…
  copied 600/84465 files…
  copied 800/84465 files…
  copied 1000/84465 files…
  copied 1200/84465 files…
  copied 1400/84465 files…
  copied 1600/84465 files…
  copied 1800/84465 files…
  copied 2000/84465 files…
  copied 2200/84465 files…
  copied 2400/84465 files…
  copied 2600/84465 files…
  copied 2800/84465 files…
  copied 3000/84465 files…
  copied 3200/84465 files…
  copied 3400/84465 files…
  copied 3600/84465 files…
  copied 3800/84465 files…
  copied 4000/84465 files…
  copied 4200/84465 files…
  copied 4400/84465 files…
  copied 4600/84465 files…
  copied 4800/84465 files…
  copied 5000/84465 files…
  copied 5200/84465 files…
  copied 5400/84465 files…
  copied 5600/84465 files…
  copied 5800/84465 files…
  copied 6000/84465 files…
  copi

# Preprocessing Step 3: Downscale to 128×128 (Efficiency Option)
We generate a compact **128×128** version of the 70/15/15 dataset for
lightweight models and ablations. We again use aspect-ratio-preserving
resize with black padding and preserve the split/class structure.

- Input: `OCT2017_70_15_15`
- Output: `OCT2017_128`


In [None]:
import os
from pathlib import Path
import cv2
from time import time
from collections import defaultdict

# =============== CONFIG ===============
INPUT_ROOT  = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_70_15_15"
OUTPUT_ROOT = r"C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_128"

TARGET_SIZE = (128, 128)         # (width, height)
EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
OVERWRITE = False                 # skip files that already exist
FORCE_GRAYSCALE = False           # set True to save as single-channel grayscale
# =====================================

def resize_with_padding(img, target_size=(128,128)):
    """Resize keeping aspect ratio, then pad with black to target size."""
    h, w = img.shape[:2]
    tw, th = target_size
    scale = min(tw / w, th / h)
    nw, nh = int(w * scale), int(h * scale)
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_AREA)

    # compute padding
    dw, dh = tw - nw, th - nh
    top, bottom = dh // 2, dh - dh // 2
    left, right = dw // 2, dw - dw // 2

    # pad (preserve channels)
    if resized.ndim == 2:
        border_val = 0
    else:
        border_val = (0, 0, 0)
    out = cv2.copyMakeBorder(resized, top, bottom, left, right,
                             borderType=cv2.BORDER_CONSTANT, value=border_val)
    return out

def iter_images(root, exts):
    root = Path(root)
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            yield p

def main():
    in_root  = Path(INPUT_ROOT)
    out_root = Path(OUTPUT_ROOT)
    out_root.mkdir(parents=True, exist_ok=True)

    if not in_root.exists():
        print("❌ INPUT_ROOT does not exist:", in_root)
        print("Tip: If this is OneDrive, right-click the folder in Explorer → 'Always keep on this device'.")
        return

    files = list(iter_images(in_root, EXTS))
    if not files:
        print("❌ No images found under:", in_root)
        return

    print(f"Found {len(files)} images. Resizing to {TARGET_SIZE}…")
    t0 = time()
    processed = skipped = errors = 0

    # simple per split/class counts
    buckets = defaultdict(int)

    def progress(i, n, bar_len=30):
        frac = (i + 1) / n
        filled = int(bar_len * frac)
        bar = "█" * filled + "·" * (bar_len - filled)
        print(f"\r[{bar}] {i+1}/{n}", end="", flush=True)

    for i, src in enumerate(files):
        rel = src.relative_to(in_root)
        dst = out_root / rel
        dst.parent.mkdir(parents=True, exist_ok=True)

        # bucket key like "train/CNV"
        parts = rel.parts
        if len(parts) >= 2 and parts[0].lower() in {"train","val","test"}:
            bucket = f"{parts[0]}/{parts[1]}"
        else:
            bucket = "other"
        buckets[bucket] += 1

        if dst.exists() and not OVERWRITE:
            skipped += 1
            progress(i, len(files))
            continue

        img = cv2.imread(str(src), cv2.IMREAD_UNCHANGED)
        if img is None:
            errors += 1
            progress(i, len(files))
            continue

        try:
            if FORCE_GRAYSCALE:
                if img.ndim == 3:
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            out = resize_with_padding(img, TARGET_SIZE)
            # ensure type for saving
            if out.dtype != 'uint8':
                out = out.clip(0, 255).astype('uint8')
            cv2.imwrite(str(dst), out)
            processed += 1
        except Exception:
            errors += 1
        finally:
            progress(i, len(files))

    dt = time() - t0
    print("\n\n✅ Done.")
    print(f"Processed : {processed}")
    print(f"Skipped   : {skipped} (exists, OVERWRITE={OVERWRITE})")
    print(f"Errors    : {errors}")
    print(f"Output to : {out_root}")
    print(f"Elapsed   : {dt:.1f}s  (~{dt / max(1, processed):.3f}s per image)\n")

    print("Per split/class counts (input seen):")
    for k in sorted(buckets.keys()):
        print(f"  {k:<20} {buckets[k]:>6}")

if __name__ == "__main__":
    main()


Found 84465 images. Resizing to (128, 128)…
[██████████████████████████████] 84465/84465

✅ Done.
Processed : 84465
Skipped   : 0 (exists, OVERWRITE=False)
Errors    : 0
Output to : C:\Users\sheno\OneDrive\CODCSD201F-006-SetupFile\Desktop\FINAL\dataset\OCT2017_128
Elapsed   : 841.3s  (~0.010s per image)

Per split/class counts (input seen):
  test/CNV               5617
  test/DME               1739
  test/DRUSEN            1329
  test/NORMAL            3984
  train/CNV             26216
  train/DME              8116
  train/DRUSEN           6201
  train/NORMAL          18593
  val/CNV                5618
  val/DME                1739
  val/DRUSEN             1329
  val/NORMAL             3984
