# Checkpoint Evaluation
Load saved checkpoints and evaluate accuracy, macro F1, and macro recall on the test set,
both on the clean images and under the synthetic degradations defined in the Noise section below.

In [None]:
import os
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image
import joblib
import cv2
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [None]:
# Paths and config

# Loading / feature-extraction parameters.
SIZE = 32          # images are resized to SIZE x SIZE before feature extraction
BATCH = 128        # batch size used by the test DataLoader
COLOR_BINS = 16    # histogram bins per colour channel
VARIETY = False  # False = macro, True = fine-grained

# Filesystem locations.
ROOT_DIR = "dataset/fruit360"
TEST_DIR = os.path.join(ROOT_DIR, "Test")
CKPT_ROOT = Path("artifacts") / "checkpoints"

In [None]:
class FruitFolderDataset(Dataset):
    """Dataset over a directory that holds one sub-folder of images per class.

    Args:
        root_dir: Directory whose immediate sub-directories are the classes.
        transform: Optional callable applied to each RGB PIL image.
        variety: If True the full folder name is the label; otherwise only
            the first whitespace-separated token of the folder name is used,
            so several folders can collapse into one label.

    Attributes are plain instance fields:
        samples: list of ``(image_path, label_string)`` pairs in a
            deterministic (sorted) order.
        labels: sorted list of the distinct label strings.
        label_to_idx / idx_to_label: mappings between label strings and
            their position in ``labels``.
    """

    # File extensions accepted as images (compared case-insensitively).
    IMAGE_EXTENSIONS = (".jpg", ".png")

    def __init__(self, root_dir, transform=None, variety=False):
        self.root_dir = root_dir
        self.transform = transform
        self.variety = variety
        self.samples = []

        for class_name in sorted(os.listdir(root_dir)):
            class_dir = os.path.join(root_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            if variety:
                label_str = class_name
            else:
                # Fall back to the raw name if it has no non-blank token,
                # instead of failing with an IndexError.
                label_str = (class_name.split() or [class_name])[0]
            # os.listdir order is filesystem dependent; sort so the sample
            # order (and anything seeded on top of it) is reproducible.
            for img_name in sorted(os.listdir(class_dir)):
                if img_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.samples.append((os.path.join(class_dir, img_name), label_str))

        self.labels = sorted({lbl for _, lbl in self.samples})
        self.label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
        self.idx_to_label = {i: lbl for lbl, i in self.label_to_idx.items()}

    def __len__(self):
        """Return the number of image samples found."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for sample ``idx``.

        The image is the transformed RGB image when a transform was given,
        otherwise the RGB PIL image itself.
        """
        img_path, label_str = self.samples[idx]
        # The context manager closes the underlying file right after decoding.
        with Image.open(img_path) as handle:
            image = handle.convert("RGB")
        img = self.transform(image) if self.transform is not None else image
        return img, self.label_to_idx[label_str]

def dataloader_to_numpy(loader):
    """Drain ``loader`` and return all inputs and targets as NumPy arrays.

    Each item yielded by ``loader`` must be an ``(inputs, targets)`` pair.
    Tensors are detached and moved to the CPU; anything else is converted
    with ``torch.tensor``. Batches are concatenated along dimension 0.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays.

    Raises:
        ValueError: if the loader yields no batches.
    """

    def _as_cpu_tensor(batch):
        # Tensors are detached and moved; other containers are converted.
        if isinstance(batch, torch.Tensor):
            return batch.detach().cpu()
        return torch.tensor(batch)

    inputs, targets = [], []
    for batch_x, batch_y in loader:
        inputs.append(_as_cpu_tensor(batch_x))
        targets.append(_as_cpu_tensor(batch_y))

    if len(inputs) == 0:
        raise ValueError("Empty loader: no samples found.")

    stacked_inputs = torch.cat(inputs, dim=0).numpy()
    stacked_targets = torch.cat(targets, dim=0).numpy()
    return stacked_inputs, stacked_targets

# Noise

In [None]:
def clamp01(x):
    """Return ``x`` with every element limited to the closed range [0, 1]."""
    lower, upper = 0.0, 1.0
    return torch.clamp(x, min=lower, max=upper)

# Parameters for each degradation transform, keyed by degradation name.
NOISE_CONFIG = dict(
    grayscale=dict(),
    blur_mild=dict(kernel_size=3, sigma=0.5),
    blur_medium=dict(kernel_size=5, sigma=1.0),
    blur_strong=dict(kernel_size=7, sigma=2.0),
    noise_mild=dict(std=0.03),
    noise_medium=dict(std=0.07),
    noise_strong=dict(std=0.15),
    dark=dict(factor=0.4),
    overexposed=dict(factor=1.8),
    noisy_blurred=dict(noise_std=0.10, kernel_size=5, sigma=1.0),
)

# Per-degradation probabilities for the "mixed" test. Each entry is checked
# independently for every image, so the values are not required to sum to 1.
MIXED_PROB_MAP = dict(
    grayscale=0.20,
    blur_mild=0.25,
    blur_medium=0.10,
    noise_mild=0.25,
    noise_medium=0.15,
    dark=0.10,
    overexposed=0.10,
    noisy_blurred=0.05,
)

def _blur_from_config(name):
    """Build a Gaussian-blur transform from the NOISE_CONFIG entry ``name``."""
    params = NOISE_CONFIG[name]
    return T.GaussianBlur(kernel_size=params["kernel_size"], sigma=params["sigma"])


def _add_gaussian_noise(x, std):
    """Add zero-mean Gaussian noise with standard deviation ``std``, then clamp to [0, 1]."""
    return clamp01(x + torch.randn_like(x) * std)


def _scale_brightness(x, factor):
    """Multiply ``x`` by ``factor`` and clamp the result to [0, 1]."""
    return clamp01(x * factor)


# Degradations keyed by name; each value is a callable applied to one image
# tensor. The lambdas read NOISE_CONFIG when they are called, while the blur
# transforms are built once, when this dictionary is created.
degradations_camera = {
    # Baseline
    "clean": lambda x: x,

    # Grayscale (colour test): channel mean replicated onto 3 channels
    "grayscale": lambda x: x.mean(dim=0, keepdim=True).repeat(3, 1, 1),

    # Blur (defocus / motion)
    "blur_mild": _blur_from_config("blur_mild"),
    "blur_medium": _blur_from_config("blur_medium"),
    "blur_strong": _blur_from_config("blur_strong"),

    # Gaussian noise (sensor)
    "noise_mild": lambda x: _add_gaussian_noise(x, NOISE_CONFIG["noise_mild"]["std"]),
    "noise_medium": lambda x: _add_gaussian_noise(x, NOISE_CONFIG["noise_medium"]["std"]),
    "noise_strong": lambda x: _add_gaussian_noise(x, NOISE_CONFIG["noise_strong"]["std"]),

    # Lighting
    "dark": lambda x: _scale_brightness(x, NOISE_CONFIG["dark"]["factor"]),
    "overexposed": lambda x: _scale_brightness(x, NOISE_CONFIG["overexposed"]["factor"]),

    # Combined (moderate noise followed by blur)
    "noisy_blurred": lambda x: _blur_from_config("noisy_blurred")(
        _add_gaussian_noise(x, NOISE_CONFIG["noisy_blurred"]["noise_std"])
    ),
}

def apply_degradation_batch(X_np, degradation_fn):
    """Apply ``degradation_fn`` to every sample of a NumPy batch.

    Args:
        X_np: NumPy array whose first axis indexes the samples.
        degradation_fn: Callable taking and returning one sample tensor.
            A non-callable value leaves the samples unchanged.

    Returns:
        A new NumPy array with the same shape and dtype as ``X_np``.
    """
    source = torch.from_numpy(X_np)
    degraded = torch.empty_like(source)
    # Resolve the "not callable -> identity" rule once, outside the loop.
    transform = degradation_fn if callable(degradation_fn) else (lambda sample: sample)
    for position, sample in enumerate(source):
        degraded[position] = transform(sample)
    return degraded.numpy()

def apply_mixed_degradations(X_np, prob_map, seed=0, degradations=None):
    """Randomly stack degradations on each sample of a NumPy batch.

    For every sample, each degradation named in ``prob_map`` is applied
    independently with its own probability, in ``prob_map`` key order, so
    one sample may receive several degradations or none. Names missing from
    ``degradations`` and the name ``"clean"`` are ignored.

    Args:
        X_np: NumPy array whose first axis indexes the samples.
        prob_map: Mapping from degradation name to probability.
        seed: Seed for the selection RNG. When it is an integer it also
            seeds the torch RNG used inside the degradations, so equal
            seeds give equal outputs.
        degradations: Mapping from name to callable. Defaults to the
            module-level ``degradations_camera``.

    Returns:
        A new NumPy array with the same shape and dtype as ``X_np``.
    """
    if degradations is None:
        degradations = degradations_camera

    rng = np.random.default_rng(seed)
    X = torch.from_numpy(X_np)
    out = torch.empty_like(X)
    keys = [k for k in prob_map if k in degradations and k != "clean"]

    # Only integer seeds can be forwarded to torch; other seed kinds accepted
    # by NumPy (None, sequences, ...) leave the torch RNG untouched.
    torch_seed = int(seed) % (2 ** 64) if isinstance(seed, (int, np.integer)) else None

    # Fork the CPU RNG so seeding here does not disturb the caller's global
    # random state; devices=[] skips saving and restoring CUDA generators.
    with torch.random.fork_rng(devices=[], enabled=torch_seed is not None):
        if torch_seed is not None:
            torch.manual_seed(torch_seed)
        for i in range(X.shape[0]):
            x = X[i]
            for k in keys:
                if rng.random() < prob_map[k]:
                    x = degradations[k](x)
            out[i] = x
    return out.numpy()

In [None]:
# Evaluation-time preprocessing: resize to SIZE x SIZE, then convert to a tensor.
val_transform = T.Compose([T.Resize((SIZE, SIZE)), T.ToTensor()])

# NOTE(review): label indices are derived from the test folders alone —
# confirm they match the label encoding the checkpoints were trained with.
test_dataset = FruitFolderDataset(root_dir=TEST_DIR, transform=val_transform, variety=VARIETY)
test_loader = DataLoader(test_dataset, batch_size=BATCH, shuffle=False)

# Materialise the whole test set in memory as NumPy arrays.
X_test_np, y_test_np = dataloader_to_numpy(test_loader)
print("Test:", X_test_np.shape, y_test_np.shape)

In [None]:
def color_hist_features(X, bins=16, img_shape=(3, 64, 64)):
    """Compute per-channel HSV histogram features for a batch of RGB images.

    Args:
        X: Array whose first axis indexes the samples; each sample must be
            reshapeable to ``img_shape``. Values are clipped to [0, 1].
        bins: Number of histogram bins per HSV channel.
        img_shape: Channel-first shape of one image.

    Returns:
        float32 array of shape ``(n_samples, 3 * bins)`` holding the H, S
        and V density histograms concatenated in that order.
    """
    edges = np.linspace(0.0, 1.0, bins + 1)
    features = np.zeros((X.shape[0], 3 * bins), dtype=np.float32)

    for row, flat_image in enumerate(X):
        # Channel-first -> channel-last, as expected by OpenCV.
        rgb = np.transpose(flat_image.reshape(img_shape), (1, 2, 0))
        rgb = np.clip(rgb, 0.0, 1.0)
        hsv = cv2.cvtColor((rgb * 255.0).astype(np.uint8), cv2.COLOR_RGB2HSV)

        # NOTE(review): every channel is normalised by 255. If OpenCV reports
        # 8-bit hue on a narrower range, the upper hue bins stay empty —
        # confirm against the training-time features before changing this.
        channel_hists = [
            np.histogram(
                (plane.astype(np.float32) / 255.0).ravel(),
                bins=edges,
                density=True,
            )[0]
            for plane in cv2.split(hsv)
        ]
        features[row] = np.concatenate(channel_hists)

    return features

# Colour-histogram features of the clean (undegraded) test set.
# NOTE(review): evaluate_checkpoints recomputes its own features from the
# images it is given; this variable may only be used by cells not shown here.
X_test_color = color_hist_features(X_test_np, bins=COLOR_BINS, img_shape=(3, SIZE, SIZE))

In [None]:
# Load checkpoints and evaluate
# Every "model.joblib" found anywhere below CKPT_ROOT counts as one checkpoint.
ckpt_model_paths = list(CKPT_ROOT.rglob("model.joblib"))
ckpt_model_paths.sort()
if not ckpt_model_paths:
    raise FileNotFoundError(f"No checkpoints found under {CKPT_ROOT}")

def infer_model_name(run_dir: Path) -> str:
    """Return the second-to-last path component of ``run_dir``.

    Falls back to ``"unknown"`` when the path has fewer than two components.
    """
    components = run_dir.parts
    if len(components) < 2:
        return "unknown"
    return components[-2]

def evaluate_checkpoints(X_np, y_np):
    """Score every discovered checkpoint on the given images and labels.

    Colour-histogram features are computed once from ``X_np``. For each path
    in ``ckpt_model_paths`` the model is loaded, the features are passed
    through the run's ``scaler.joblib`` when that file exists, and accuracy,
    macro F1 and macro recall are computed against ``y_np``.

    Returns:
        List with one dict per checkpoint, with keys ``run_dir``,
        ``model_name``, ``acc``, ``f1_macro`` and ``recall_macro``.
    """
    features = color_hist_features(X_np, bins=COLOR_BINS, img_shape=(3, SIZE, SIZE))

    def _score(model_path):
        """Load one checkpoint and return its metrics record."""
        run_dir = model_path.parent
        # NOTE: joblib.load unpickles arbitrary objects; only point
        # CKPT_ROOT at checkpoints from a trusted source.
        model = joblib.load(model_path)

        scaler = None
        scaler_path = run_dir / "scaler.joblib"
        if scaler_path.exists():
            scaler = joblib.load(scaler_path)
        inputs = features if scaler is None else scaler.transform(features)

        predictions = model.predict(inputs)
        return {
            "run_dir": str(run_dir),
            "model_name": infer_model_name(run_dir),
            "acc": accuracy_score(y_np, predictions),
            "f1_macro": f1_score(y_np, predictions, average="macro"),
            "recall_macro": recall_score(y_np, predictions, average="macro"),
        }

    return [_score(model_path) for model_path in ckpt_model_paths]

# Seed shared by every stochastic degradation below, so repeated runs of this
# cell report the same metrics.
EVAL_SEED = 42

# 1) Test set fully degraded, one pass per degradation
noise_keys = list(degradations_camera)
noise_results = {}
for noise_key in noise_keys:
    # Re-seed per condition: the noise degradations draw from torch's global
    # RNG, and seeding here makes each result independent of loop order.
    torch.manual_seed(EVAL_SEED)
    X_noisy = apply_degradation_batch(X_test_np, degradations_camera[noise_key])
    noise_results[noise_key] = evaluate_checkpoints(X_noisy, y_test_np)
    print(f"Done: {noise_key}")

# 2) Test set with every degradation applied according to its probability
X_mixed = apply_mixed_degradations(X_test_np, MIXED_PROB_MAP, seed=EVAL_SEED)
mixed_results = evaluate_checkpoints(X_mixed, y_test_np)

# Report (clean + noise)
def summarize_results(results, key="acc"):
    """Average one metric per model name.

    Args:
        results: Iterable of dicts that contain ``"model_name"`` and ``key``.
        key: Name of the metric to average.

    Returns:
        Dict mapping each model name, in order of first appearance, to the
        mean of its ``key`` values as a Python float.
    """
    grouped = {}
    for record in results:
        name = record["model_name"]
        if name not in grouped:
            grouped[name] = []
        grouped[name].append(record[key])
    return {name: float(np.mean(values)) for name, values in grouped.items()}

# Clean test set: per-model means, each summary computed once up front.
print("\nMean metrics per model (clean):")
clean_results = noise_results["clean"]
clean_summary = summarize_results(clean_results, key="acc")
clean_f1 = summarize_results(clean_results, key="f1_macro")
clean_rec = summarize_results(clean_results, key="recall_macro")
for model_name in sorted(clean_summary):
    print(f"{model_name}: acc={clean_summary[model_name]:.4f} | f1={clean_f1[model_name]:.4f} | rec={clean_rec[model_name]:.4f}")

# Mixed-degradation test set: same report layout.
print("\nMean metrics per model (mixed):")
mixed_acc = summarize_results(mixed_results, key="acc")
mixed_f1 = summarize_results(mixed_results, key="f1_macro")
mixed_rec = summarize_results(mixed_results, key="recall_macro")
for model_name in sorted(mixed_acc):
    print(f"{model_name}: acc={mixed_acc[model_name]:.4f} | f1={mixed_f1[model_name]:.4f} | rec={mixed_rec[model_name]:.4f}")

# Plot: performance comparison per degradation (mean per model)
model_names = sorted({r["model_name"] for rs in noise_results.values() for r in rs})
noises = noise_keys

# One (model x degradation) matrix per metric; NaN marks "no result".
matrix_shape = (len(model_names), len(noises))
acc_matrix = np.full(matrix_shape, np.nan, dtype=np.float32)
f1_matrix = np.full(matrix_shape, np.nan, dtype=np.float32)
rec_matrix = np.full(matrix_shape, np.nan, dtype=np.float32)
matrix_by_metric = {
    "acc": acc_matrix,
    "f1_macro": f1_matrix,
    "recall_macro": rec_matrix,
}

for j, noise in enumerate(noises):
    for i, model in enumerate(model_names):
        model_rows = [r for r in noise_results[noise] if r["model_name"] == model]
        if not model_rows:
            continue
        # Every row carries all three metrics, so one emptiness check covers them.
        for metric, matrix in matrix_by_metric.items():
            matrix[i, j] = float(np.mean([r[metric] for r in model_rows]))

# One line chart per metric: a line per model across all degradations.
metric_plots = (
    (acc_matrix, "Mean Test Accuracy", "Performance vs rumore (media per modello)"),
    (f1_matrix, "Macro F1", "Macro F1 vs rumore (media per modello)"),
    (rec_matrix, "Macro Recall", "Macro Recall vs rumore (media per modello)"),
)
for metric_matrix, y_label, plot_title in metric_plots:
    plt.figure(figsize=(11, 4))
    for i, model in enumerate(model_names):
        plt.plot(noises, metric_matrix[i], marker="o", label=model)
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.ylim(0, 1)
    plt.xticks(rotation=30, ha="right")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot: clean vs mixed comparison (mean accuracy per model)
if "clean" in noises:
    clean_column = noises.index("clean")
    clean_means = [np.nanmean(acc_matrix[i, clean_column]) for i in range(len(model_names))]
else:
    clean_means = [np.nan for _ in model_names]
mixed_means = [mixed_acc.get(m, np.nan) for m in model_names]

# Two bars per model, placed side by side around each tick.
x = np.arange(len(model_names))
bar_width = 0.4
plt.figure(figsize=(8, 4))
plt.bar(x - bar_width / 2, clean_means, width=bar_width, label="clean")
plt.bar(x + bar_width / 2, mixed_means, width=bar_width, label="mixed")
plt.xticks(x, model_names, rotation=20)
plt.ylabel("Mean Test Accuracy")
plt.title("Clean vs Mixed")
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()