# MIT 805: Assignment part 2
## Thabo Chesane
## u20507102

### Packages required

In [2]:
# %pip install pillow numpy pandas scikit-learn plotly tqdm


### imports & config

In [3]:
from pathlib import Path
import os, io, math, json, itertools
from collections import defaultdict, Counter
from dataclasses import dataclass
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import plotly.express as px
import plotly.graph_objects as go

### Check folders exist

In [None]:
DATA_ROOT = Path("skin-lesions")   


assert (DATA_ROOT / "train").exists(), "Missing 'train' folder under DATA_ROOT"
assert (DATA_ROOT / "valid").exists(), "Missing 'valid' folder under DATA_ROOT"
assert (DATA_ROOT / "test").exists(),  "Missing 'test' folder under DATA_ROOT"

In [5]:
CLASSES = sorted([p.name for p in (DATA_ROOT/"train").iterdir() if p.is_dir()])
CLASSES

['melanoma', 'nevus', 'seborrheic_keratosis']

In [6]:
def list_images_with_labels(split: str) -> List[Tuple[Path, str]]:
    base = DATA_ROOT / split
    items = []
    for cls in CLASSES:
        for p in (base / cls).glob("*.jpg"):
            items.append((p, cls))
        for p in (base / cls).glob("*.png"):
            items.append((p, cls))
        for p in (base / cls).glob("*.jpeg"):
            items.append((p, cls))
    return items

train_files = list_images_with_labels("train")
valid_files = list_images_with_labels("valid")
test_files  = list_images_with_labels("test")

len(train_files), len(valid_files), len(test_files)

(2000, 150, 600)

In [7]:
def split_counts(files):
    c = Counter([lbl for _, lbl in files])
    return pd.DataFrame({"label": list(c.keys()), "count": list(c.values())}).sort_values("count", ascending=False)

split_summary = {
    "train": split_counts(train_files),
    "valid": split_counts(valid_files),
    "test":  split_counts(test_files)
}
split_summary["train"]

Unnamed: 0,label,count
1,nevus,1372
0,melanoma,374
2,seborrheic_keratosis,254


In [None]:
def _image_to_array(path: Path, target_size=(256, 256)):
    
    with Image.open(path) as im:
        im = im.convert("RGB").resize(target_size, Image.BILINEAR)
        return np.asarray(im, dtype=np.uint8)

def _gradient_magnitude(gray: np.ndarray) -> np.ndarray:
    
    gx = np.array([[-1,0,1],[-2,0,2],[-1,0,1]], dtype=np.int32)
    gy = gx.T
    
    g = gray.astype(np.int32)
    pad = np.pad(g, ((1,1),(1,1)), mode="edge")

    mx = np.zeros_like(g, dtype=np.float32)
    my = np.zeros_like(g, dtype=np.float32)
    for i in range(g.shape[0]):
        for j in range(g.shape[1]):
            roi = pad[i:i+3, j:j+3]
            mx[i,j] = (roi * gx).sum()
            my[i,j] = (roi * gy).sum()
    mag = np.sqrt(mx*mx + my*my)
    
    mag = mag / (mag.max() + 1e-9) * 255.0
    return mag.astype(np.float32)

def extract_features(img_rgb: np.ndarray) -> np.ndarray:
    
    feat = []
    for ch in range(3):
        h, _ = np.histogram(img_rgb[..., ch], bins=32, range=(0,256), density=True)
        feat.append(h.astype(np.float32))
    
    gray = (0.299*img_rgb[...,0] + 0.587*img_rgb[...,1] + 0.114*img_rgb[...,2]).astype(np.uint8)
    mag = _gradient_magnitude(gray)
    th, _ = np.histogram(mag, bins=32, range=(0,255), density=True)
    feat.append(th.astype(np.float32))
    return np.concatenate(feat)  

def map_image_to_kv(path: Path, label: str) -> Tuple[str, np.ndarray]:
    arr = _image_to_array(path)
    feat = extract_features(arr)
    return (label, feat)

### MapReduce Implementation

#### Map

In [9]:
SAMPLE = None  

def map_split(files: List[Tuple[Path, str]], sample=SAMPLE):
    if sample is not None and len(files) > sample:
        files = list(itertools.islice(files, sample))
    out = []
    for p, lbl in tqdm(files, desc="Mapping images"):
        try:
            out.append(map_image_to_kv(p, lbl))
        except Exception as e:
            # skip unreadable/corrupt files
            continue
    return out

mapped_train = map_split(train_files)
mapped_valid = map_split(valid_files)
mapped_test  = map_split(test_files)

len(mapped_train), len(mapped_valid), len(mapped_test)

Mapping images: 100%|██████████| 2000/2000 [09:13<00:00,  3.61it/s]
Mapping images: 100%|██████████| 150/150 [00:48<00:00,  3.11it/s]
Mapping images: 100%|██████████| 600/600 [03:39<00:00,  2.73it/s]


(2000, 150, 600)

#### Shuffle

In [10]:
def shuffle_group_by_label(mapped_records: List[Tuple[str, np.ndarray]]) -> Dict[str, List[np.ndarray]]:
    grouped = defaultdict(list)
    for lbl, feat in mapped_records:
        grouped[lbl].append(feat)
    return grouped

grouped_train = shuffle_group_by_label(mapped_train)
{k: len(v) for k, v in grouped_train.items()}


{'melanoma': 374, 'nevus': 1372, 'seborrheic_keratosis': 254}

#### Reduce

In [None]:
def reduce_class_stats(grouped: Dict[str, List[np.ndarray]]):
    stats = {}
    for lbl, feats in grouped.items():
        M = np.stack(feats, axis=0).astype(np.float64)
        cnt = M.shape[0]
        mean = M.mean(axis=0)
        var  = M.var(axis=0)
        stats[lbl] = {"count": int(cnt), "mean": mean.astype(np.float32), "var": var.astype(np.float32)}
    return stats

class_stats = reduce_class_stats(grouped_train)
{lbl: v["count"] for lbl, v in class_stats.items()}

{'melanoma': 374, 'nevus': 1372, 'seborrheic_keratosis': 254}

In [12]:
def records_to_dataframe(mapped_records: List[Tuple[str, np.ndarray]]) -> pd.DataFrame:
    labels = [lbl for lbl, _ in mapped_records]
    feats  = [feat for _, feat in mapped_records]
    X = np.vstack(feats)
    df = pd.DataFrame(X, columns=[f"f{i:03d}" for i in range(X.shape[1])])
    df["label"] = labels
    return df

df_train = records_to_dataframe(mapped_train)
df_valid = records_to_dataframe(mapped_valid)
df_test  = records_to_dataframe(mapped_test)

df_train.shape, df_valid.shape, df_test.shape

((2000, 129), (150, 129), (600, 129))

### Visualisations

In [13]:
def plot_class_distribution(df: pd.DataFrame, title: str):
    counts = df["label"].value_counts().reset_index()
    counts.columns = ["label", "count"]
    fig = px.bar(counts, x="label", y="count", title=title, text="count")
    fig.update_traces(textposition="outside")
    fig.update_layout(yaxis_title="Images", xaxis_title="Class", uniformtext_minsize=12, uniformtext_mode='hide')
    fig.show()

plot_class_distribution(df_train, "Class Distribution — Train")
plot_class_distribution(df_valid, "Class Distribution — Valid")
plot_class_distribution(df_test,  "Class Distribution — Test")





This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.




In [None]:
def plot_pca(df: pd.DataFrame, title: str, max_points=3000):
    
    if len(df) > max_points:
        df = df.sample(max_points, random_state=42)

    X = df.drop(columns=["label"]).to_numpy(dtype=np.float32)
    y = df["label"].to_numpy()

    X = StandardScaler().fit_transform(X)
    X2 = PCA(n_components=2, random_state=42).fit_transform(X)
    out = pd.DataFrame({"pc1": X2[:,0], "pc2": X2[:,1], "label": y})

    fig = px.scatter(out, x="pc1", y="pc2", color="label", title=title, opacity=0.8)
    fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
    fig.show()

plot_pca(df_train, "PCA of Image Features — Train (sampled)")

In [15]:
def train_evaluate_baseline(df_train: pd.DataFrame, df_val: pd.DataFrame):
    X_train = df_train.drop(columns=["label"]).to_numpy(dtype=np.float32)
    y_train = df_train["label"].to_numpy()
    X_val   = df_val.drop(columns=["label"]).to_numpy(dtype=np.float32)
    y_val   = df_val["label"].to_numpy()

    scaler = StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_val_s   = scaler.transform(X_val)

    clf = LogisticRegression(max_iter=300, n_jobs=-1)
    clf.fit(X_train_s, y_train)
    y_pred = clf.predict(X_val_s)
    report = classification_report(y_val, y_pred, output_dict=True)
    cm = confusion_matrix(y_val, y_pred, labels=clf.classes_)
    return clf, scaler, report, cm, clf.classes_

clf, scaler, report, cm, classes = train_evaluate_baseline(df_train, df_valid)
pd.DataFrame(report).transpose().round(3)

Unnamed: 0,precision,recall,f1-score,support
melanoma,0.5,0.133,0.211,30.0
nevus,0.546,0.91,0.683,78.0
seborrheic_keratosis,0.333,0.095,0.148,42.0
accuracy,0.527,0.527,0.527,0.527
macro avg,0.46,0.38,0.347,150.0
weighted avg,0.477,0.527,0.439,150.0


In [16]:
def plot_confusion_matrix(cm: np.ndarray, labels: List[str], title: str):
    z = cm.astype(int)
    fig = go.Figure(data=go.Heatmap(
        z=z, x=labels, y=labels, colorscale="Viridis", hovertemplate="Pred: %{x}<br>True: %{y}<br>Count: %{z}<extra></extra>"
    ))
    fig.update_layout(title=title, xaxis_title="Predicted", yaxis_title="True")
    fig.show()

plot_confusion_matrix(cm, list(classes), "Confusion Matrix — Logistic Regression (Valid)")


In [17]:
def evaluate_on_test(df_test: pd.DataFrame, clf, scaler):
    X = df_test.drop(columns=["label"]).to_numpy(dtype=np.float32)
    y = df_test["label"].to_numpy()
    Xs = scaler.transform(X)
    y_pred = clf.predict(Xs)
    rep = classification_report(y, y_pred, output_dict=True)
    cm_test = confusion_matrix(y, y_pred, labels=clf.classes_)
    return rep, cm_test

rep_test, cm_test = evaluate_on_test(df_test, clf, scaler)
pd.DataFrame(rep_test).transpose().round(3)

Unnamed: 0,precision,recall,f1-score,support
melanoma,0.268,0.094,0.139,117.0
nevus,0.651,0.827,0.729,393.0
seborrheic_keratosis,0.133,0.089,0.107,90.0
accuracy,0.573,0.573,0.573,0.573
macro avg,0.351,0.337,0.325,600.0
weighted avg,0.499,0.573,0.52,600.0


In [None]:

import torch
from torch import nn
from torchvision import models, transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


mv2 = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
mv2.eval()

for p in mv2.parameters():
    p.requires_grad = False


feature_extractor = nn.Sequential(
    mv2.features,                
    nn.AdaptiveAvgPool2d((1,1)),  
    nn.Flatten()                  
).to(device)


img_transforms = models.MobileNet_V2_Weights.IMAGENET1K_V1.transforms()


Using device: cpu
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /Users/thabochesane/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


100%|██████████| 13.6M/13.6M [00:03<00:00, 4.44MB/s]


### MapReduce using CNN

In [20]:
from PIL import Image
import numpy as np

@torch.no_grad()
def image_path_to_embedding(path: Path) -> np.ndarray:
    """Load a single image path → 1280-dim MobileNetV2 embedding (numpy.float32)."""
    with Image.open(path) as im:
        im = im.convert("RGB")
    x = img_transforms(im).unsqueeze(0).to(device) 
    feat = feature_extractor(x)                    
    return feat.squeeze(0).cpu().numpy().astype("float32")

def map_image_to_kv_cnn(path: Path, label: str) -> Tuple[str, np.ndarray]:
    """MAP: returns (label, embedding) just like the histogram version."""
    emb = image_path_to_embedding(path)
    return (label, emb)

#### Map

In [21]:
from tqdm import tqdm
import itertools

SAMPLE_CNN = None  

def map_split_cnn(files: List[Tuple[Path, str]], sample=SAMPLE_CNN):
    if sample is not None and len(files) > sample:
        files = list(itertools.islice(files, sample))
    out = []
    for p, lbl in tqdm(files, desc="Mapping images (CNN)"):
        try:
            out.append(map_image_to_kv_cnn(p, lbl))
        except Exception:
            # ignore unreadable/corrupt image
            continue
    return out

mapped_train_cnn = map_split_cnn(train_files)
mapped_valid_cnn = map_split_cnn(valid_files)
mapped_test_cnn  = map_split_cnn(test_files)

len(mapped_train_cnn), len(mapped_valid_cnn), len(mapped_test_cnn)

Mapping images (CNN): 100%|██████████| 2000/2000 [03:16<00:00, 10.18it/s]
Mapping images (CNN): 100%|██████████| 150/150 [00:21<00:00,  7.00it/s]
Mapping images (CNN): 100%|██████████| 600/600 [01:56<00:00,  5.14it/s]


(2000, 150, 600)

#### Shuffle & Reduce

In [22]:
grouped_train_cnn = shuffle_group_by_label(mapped_train_cnn)
class_stats_cnn = reduce_class_stats(grouped_train_cnn)


{k: v["count"] for k, v in class_stats_cnn.items()}

{'melanoma': 374, 'nevus': 1372, 'seborrheic_keratosis': 254}

In [23]:
df_train_cnn = records_to_dataframe(mapped_train_cnn)
df_valid_cnn = records_to_dataframe(mapped_valid_cnn)
df_test_cnn  = records_to_dataframe(mapped_test_cnn)

df_train_cnn.shape, df_valid_cnn.shape, df_test_cnn.shape

((2000, 1281), (150, 1281), (600, 1281))

In [24]:
plot_class_distribution(df_train_cnn, "Class Distribution — Train (CNN features)")
plot_class_distribution(df_valid_cnn, "Class Distribution — Valid (CNN features)")
plot_class_distribution(df_test_cnn,  "Class Distribution — Test (CNN features)")

plot_pca(df_train_cnn, "PCA of CNN Embeddings — Train (sampled)")

In [25]:
clf_cnn, scaler_cnn, report_cnn, cm_cnn, classes_cnn = train_evaluate_baseline(df_train_cnn, df_valid_cnn)
pd.DataFrame(report_cnn).transpose().round(3)

Unnamed: 0,precision,recall,f1-score,support
melanoma,0.519,0.467,0.491,30.0
nevus,0.708,0.808,0.754,78.0
seborrheic_keratosis,0.676,0.548,0.605,42.0
accuracy,0.667,0.667,0.667,0.667
macro avg,0.634,0.607,0.617,150.0
weighted avg,0.661,0.667,0.66,150.0


In [26]:
plot_confusion_matrix(cm_cnn, list(classes_cnn), "Confusion Matrix — Logistic Regression on CNN Embeddings (Valid)")

In [28]:
rep_test_cnn, cm_test_cnn = evaluate_on_test(df_test_cnn, clf_cnn, scaler_cnn)
pd.DataFrame(rep_test_cnn).transpose().round(3)

Unnamed: 0,precision,recall,f1-score,support
melanoma,0.348,0.393,0.369,117.0
nevus,0.777,0.728,0.752,393.0
seborrheic_keratosis,0.4,0.444,0.421,90.0
accuracy,0.62,0.62,0.62,0.62
macro avg,0.509,0.522,0.514,600.0
weighted avg,0.637,0.62,0.628,600.0


### Fine-tuned MobileNetV model

In [29]:
import pandas as pd
from collections import Counter
import numpy as np

def files_to_df(pairs):
    return pd.DataFrame({"path": [str(p) for p,_ in pairs], "label": [lbl for _,lbl in pairs]})

df_tr_paths = files_to_df(train_files)
df_va_paths = files_to_df(valid_files)
df_te_paths = files_to_df(test_files)

label_to_idx = {c:i for i,c in enumerate(CLASSES)}
y_tr_indices = np.array([label_to_idx[l] for l in df_tr_paths["label"]])

counts = Counter(y_tr_indices)
num_classes = len(CLASSES)
class_counts = np.array([counts[i] for i in range(num_classes)], dtype=np.float32)
class_weights = (class_counts.sum() / (num_classes * class_counts))
class_weights

array([1.7825311 , 0.48590866, 2.624672  ], dtype=float32)

In [None]:


import os, warnings
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from PIL import Image, ImageFile
from torchvision import transforms


ImageFile.LOAD_TRUNCATED_IMAGES = True


try:
    import torch.multiprocessing as mp
    mp.set_start_method("spawn", force=False)  
except RuntimeError:
    pass  


train_tfms = transforms.Compose([
    transforms.Resize(320),
    transforms.RandomResizedCrop(288, scale=(0.8, 1.0), ratio=(0.9, 1.1)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.10, hue=0.02),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

eval_tfms = transforms.Compose([
    transforms.Resize(320),
    transforms.CenterCrop(288),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])


class LesionDatasetSafe(Dataset):
    def __init__(self, df, tfms, label_to_idx, skip_corrupt=True):
        self.df = df.reset_index(drop=True)
        self.tfms = tfms
        self.label_to_idx = label_to_idx
        self.skip_corrupt = skip_corrupt

    def __len__(self): 
        return len(self.df)

    def __getitem__(self, i):
        path = self.df.loc[i, "path"]
        label = self.df.loc[i, "label"]
        try:
            with Image.open(path) as im:
                im = im.convert("RGB")
            x = self.tfms(im)
            y = self.label_to_idx[label]
            return x, y
        except Exception as e:
            if self.skip_corrupt:
                
                return None
            else:
                raise


def drop_none_collate(batch):
    batch = [b for b in batch if b is not None]
    if len(batch) == 0:
        
        return None
    xs, ys = zip(*batch)
    return torch.stack(xs, 0), torch.tensor(ys, dtype=torch.long)


ds_tr = LesionDatasetSafe(df_tr_paths, train_tfms, label_to_idx, skip_corrupt=True)
ds_va = LesionDatasetSafe(df_va_paths, eval_tfms,  label_to_idx, skip_corrupt=True)
ds_te = LesionDatasetSafe(df_te_paths, eval_tfms,  label_to_idx, skip_corrupt=True)


sample_weights = np.array([class_weights[label_to_idx[l]] for l in df_tr_paths["label"]], dtype=np.float32)
sampler = WeightedRandomSampler(
    weights=torch.tensor(sample_weights), 
    num_samples=len(sample_weights), 
    replacement=True
)


BATCH_SIZE = 32
NUM_WORKERS = 0 
PIN_MEMORY = torch.cuda.is_available()

dl_tr = DataLoader(
    ds_tr, batch_size=BATCH_SIZE, sampler=sampler, 
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
    persistent_workers=False, prefetch_factor=None,  
    collate_fn=drop_none_collate
)

dl_va = DataLoader(
    ds_va, batch_size=BATCH_SIZE, shuffle=False, 
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
    persistent_workers=False, prefetch_factor=None,
    collate_fn=drop_none_collate
)

dl_te = DataLoader(
    ds_te, batch_size=BATCH_SIZE, shuffle=False, 
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
    persistent_workers=False, prefetch_factor=None,
    collate_fn=drop_none_collate
)

len(ds_tr), len(ds_va), len(ds_te)


(2000, 150, 600)

In [None]:

from torchvision import models
from torch import nn
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

m = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
in_feats = m.classifier[1].in_features
m.classifier[1] = nn.Linear(in_feats, len(CLASSES))


for p in m.features.parameters():
    p.requires_grad = False
for p in m.features[-2:].parameters():
    p.requires_grad = True

m = m.to(device)
print("Trainable params:", sum(p.requires_grad for p in m.parameters()))


Using device: cpu
Trainable params: 14


In [None]:

from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn as nn
import numpy as np
import torch

use_focal = False  

class WeightedFocalLoss(nn.Module):
    def __init__(self, alpha, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha = torch.tensor(alpha, dtype=torch.float32)
        self.gamma = gamma
        self.reduction = reduction
    def forward(self, logits, targets):
        ce = nn.functional.cross_entropy(
            logits, targets, weight=self.alpha.to(logits.device), reduction="none"
        )
        pt = torch.exp(-ce)
        loss = ((1-pt)**self.gamma) * ce
        return loss.mean() if self.reduction == "mean" else loss.sum()

alpha = class_weights  
criterion = (WeightedFocalLoss(alpha=alpha, gamma=2.0)
             if use_focal
             else nn.CrossEntropyLoss(weight=torch.tensor(alpha, dtype=torch.float32).to(device)))

optimizer = AdamW(filter(lambda p: p.requires_grad, m.parameters()), lr=3e-4, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=10)


In [None]:

from sklearn.metrics import roc_auc_score
import numpy as np
import torch

def evaluate(model, dl):
    model.eval()
    all_logits, all_targets = [], []
    with torch.no_grad():
        for batch in dl:
            if batch is None:  
                continue
            xb, yb = batch
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            all_logits.append(logits.cpu().numpy())
            all_targets.append(yb.cpu().numpy())
    if not all_logits:  
        return np.nan, np.nan, np.array([]), np.array([]), np.array([[]])

    logits = np.concatenate(all_logits, axis=0)
    targets = np.concatenate(all_targets, axis=0)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
    try:
        auroc = roc_auc_score(targets, probs, multi_class="ovr", average="macro")
    except ValueError:
        auroc = np.nan
    preds = probs.argmax(1)
    acc = (preds == targets).mean() if len(targets) else np.nan
    return acc, auroc, preds, targets, probs

best_auroc, best_state = -1, None
EPOCHS = 15
patience, bad = 5, 0

for epoch in range(1, EPOCHS+1):
    m.train()
    running = 0.0
    seen = 0
    for batch in dl_tr:
        if batch is None:
            continue
        xb, yb = batch
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = m(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running += loss.item() * xb.size(0)
        seen += xb.size(0)

    scheduler.step()
    tr_loss = running / max(seen, 1)

    va_acc, va_auroc, _, _, _ = evaluate(m, dl_va)
    print(f"Epoch {epoch:02d} | train loss {tr_loss:.4f} | val acc {va_acc:.3f} | val AUROC {va_auroc:.3f}")

    if np.isnan(va_auroc):
        continue
    if va_auroc > best_auroc:
        best_auroc = va_auroc
        best_state = {k: v.cpu() for k, v in m.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping.")
            break

if best_state is not None:
    m.load_state_dict({k: v.to(device) for k, v in best_state.items()})


Epoch 01 | train loss 0.5678 | val acc 0.520 | val AUROC 0.816
Epoch 02 | train loss 0.4359 | val acc 0.487 | val AUROC 0.829
Epoch 03 | train loss 0.3344 | val acc 0.567 | val AUROC 0.832
Epoch 04 | train loss 0.2958 | val acc 0.680 | val AUROC 0.860
Epoch 05 | train loss 0.2658 | val acc 0.667 | val AUROC 0.843
Epoch 06 | train loss 0.2297 | val acc 0.653 | val AUROC 0.863
Epoch 07 | train loss 0.1973 | val acc 0.687 | val AUROC 0.867
Epoch 08 | train loss 0.1940 | val acc 0.687 | val AUROC 0.868
Epoch 09 | train loss 0.1849 | val acc 0.713 | val AUROC 0.876
Epoch 10 | train loss 0.1578 | val acc 0.713 | val AUROC 0.874
Epoch 11 | train loss 0.1841 | val acc 0.693 | val AUROC 0.877
Epoch 12 | train loss 0.1684 | val acc 0.707 | val AUROC 0.871
Epoch 13 | train loss 0.1658 | val acc 0.707 | val AUROC 0.877
Epoch 14 | train loss 0.1835 | val acc 0.740 | val AUROC 0.881
Epoch 15 | train loss 0.1724 | val acc 0.673 | val AUROC 0.864


In [36]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

va_acc, va_auroc, va_preds, va_targets, va_probs = evaluate(m, dl_va)
print(f"Validation ACC: {va_acc:.3f} | AUROC (macro): {va_auroc:.3f}")
if va_preds.size > 0:
    print(pd.DataFrame(classification_report(va_targets, va_preds, target_names=CLASSES, output_dict=True)).transpose().round(3))
    cm_va = confusion_matrix(va_targets, va_preds, labels=list(range(len(CLASSES))))
    plot_confusion_matrix(cm_va, CLASSES, "Confusion Matrix — Fine-tuned MobileNetV2 (Valid)")
else:
    print("No validation predictions available (all batches may have been dropped).")


Validation ACC: 0.673 | AUROC (macro): 0.864
                      precision  recall  f1-score  support
melanoma                  0.500   0.600     0.545   30.000
nevus                     0.917   0.564     0.698   78.000
seborrheic_keratosis      0.591   0.929     0.722   42.000
accuracy                  0.673   0.673     0.673    0.673
macro avg                 0.669   0.698     0.655  150.000
weighted avg              0.742   0.673     0.674  150.000


In [40]:

from sklearn.metrics import classification_report
import numpy as np

te_acc, te_auroc, te_preds, te_targets, te_probs = evaluate(m, dl_te)
print(f"Test ACC: {te_acc:.3f} | AUROC (macro): {te_auroc:.3f}")
if te_preds.size > 0:
    print(pd.DataFrame(classification_report(te_targets, te_preds, target_names=CLASSES, output_dict=True)).transpose().round(3))
    cm_te = confusion_matrix(te_targets, te_preds, labels=list(range(len(CLASSES))))
    plot_confusion_matrix(cm_te, CLASSES, "Confusion Matrix — Fine-tuned MobileNetV2 (Test)")

   
    if "melanoma" in CLASSES:
        mel_idx = CLASSES.index("melanoma")
        thr = 0.35 
        te_preds_thr = (te_probs[:, mel_idx] > thr).astype(int)
        y_true_mel = (np.array(te_targets) == mel_idx).astype(int)
        print("\nBinary melanoma sensitivity @ thr=0.35:")
        print(classification_report(y_true_mel, te_preds_thr, target_names=["non-mel", "mel"], digits=3))
else:
    print("No test predictions available (all batches may have been dropped).")


Test ACC: 0.528 | AUROC (macro): 0.819
                      precision  recall  f1-score  support
melanoma                  0.406   0.590     0.481  117.000
nevus                     0.933   0.427     0.586  393.000
seborrheic_keratosis      0.320   0.889     0.471   90.000
accuracy                  0.528   0.528     0.528    0.528
macro avg                 0.553   0.635     0.513  600.000
weighted avg              0.738   0.528     0.548  600.000



Binary melanoma sensitivity @ thr=0.35:
              precision    recall  f1-score   support

     non-mel      0.895     0.741     0.811       483
         mel      0.375     0.641     0.473       117

    accuracy                          0.722       600
   macro avg      0.635     0.691     0.642       600
weighted avg      0.794     0.722     0.745       600



In [None]:

import numpy as np
import torch

def tta_predict(model, dl, n=4):
    model.eval()
    all_logits, all_targets = [], []
    with torch.no_grad():
        for batch in dl:
            if batch is None:
                continue
            xb, yb = batch
            xb, yb = xb.to(device), yb.to(device)
            logits_sum = 0
            for _ in range(n):
                
                xb_aug = torch.flip(xb, dims=[3]) if np.random.rand() < 0.5 else xb
                logits_sum += model(xb_aug)
            all_logits.append((logits_sum / n).cpu().numpy())
            all_targets.append(yb.cpu().numpy())
    if not all_logits:
        return np.array([[]]), np.array([])
    return np.concatenate(all_logits), np.concatenate(all_targets)

logits_tta, tgt_tta = tta_predict(m, dl_te, n=4)
if logits_tta.size:
    probs_tta = torch.softmax(torch.tensor(logits_tta), dim=1).numpy()
    preds_tta = probs_tta.argmax(1)
    print(pd.DataFrame(classification_report(tgt_tta, preds_tta, target_names=CLASSES, output_dict=True)).transpose().round(3))
else:
    print("TTA produced no outputs (all test batches dropped).")


                      precision  recall  f1-score  support
melanoma                  0.410   0.607     0.490  117.000
nevus                     0.946   0.445     0.606  393.000
seborrheic_keratosis      0.331   0.889     0.482   90.000
accuracy                  0.543   0.543     0.543    0.543
macro avg                 0.562   0.647     0.526  600.000
weighted avg              0.749   0.543     0.564  600.000
