# Unified HCF+PCA+SVM Pipeline: Clean Features vs Degraded Test
Comprehensive comparison: Color Histogram solo, then with HOG/LBP/GABOR/GLCM.

**Key aspects:**
- Training augmentation: 20% scenario-based (A/B/C with weights 0.4/0.4/0.2)
- Test evaluation: Mixed degradation (60% clean, 15% A, 15% B, 10% C)
- Grid search: Subset-based with cross-validation
- Validation: Proper hold-out split
- Models saved as joblib

## 1. Setup & Imports

In [1]:
import os
import shutil
import subprocess
import time
import math
import random
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

from skimage.feature import hog, local_binary_pattern, graycomatrix, graycoprops
from skimage.filters import gabor
from skimage.color import rgb2gray

import joblib
import seaborn as sns

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("All libraries imported successfully!")

All libraries imported successfully!


## 2. Global Configuration

In [2]:
# Random state
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Image processing
SIZE = 32
VARIETY = False  # False = coarse labels (Apple, not Apple Braeburn)
BATCH_SIZE = 128

# Training augmentation
AUG_RATIO = 0.20  # 20% of training set will be augmented
AUG_DIST = {"A": 0.4, "B": 0.4, "C": 0.2}  # Scenario distribution

# Feature extraction
HIST_BINS = 32

# Grid search parameters
VARIANCE_TARGETS = [0.95]
CV_FOLDS = 3
TUNING_SUBSET_RATIO = 0.20
C_VALUES = [10, 100]
GAMMA_VALUES = [0.01, 0.001]

# Test set degradation probabilities
TEST_DEGRADATION_DISTRIBUTION = [0.60, 0.15, 0.15, 0.10]  # clean, A, B, C

# Model save directory
MODEL_SAVE_DIR = Path("saved_models/hcf_degraded_comparison")
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)

print("Configuration:")
print(f"  Image size: {SIZE}x{SIZE}")
print(f"  Training augmentation: {AUG_RATIO*100}%")
print(f"  Test degradation: {TEST_DEGRADATION_DISTRIBUTION}")
print(f"  Histogram bins: {HIST_BINS}")
print(f"  PCA variance targets: {VARIANCE_TARGETS}")
print(f"  Grid search: C={C_VALUES}, gamma={GAMMA_VALUES}")
print(f"  Models saved to: {MODEL_SAVE_DIR}")

Configuration:
  Image size: 32x32
  Training augmentation: 20.0%
  Test degradation: [0.6, 0.15, 0.15, 0.1]
  Histogram bins: 32
  PCA variance targets: [0.95]
  Grid search: C=[10, 100], gamma=[0.01, 0.001]
  Models saved to: saved_models/hcf_degraded_comparison


## 3. Dataset Loading

In [3]:
ROOT_DIR = "dataset/fruit360"
TRAIN_DIR = os.path.join(ROOT_DIR, "Training")
TEST_DIR = os.path.join(ROOT_DIR, "Test")

GITHUB_REPO = "https://github.com/fruits-360/fruits-360-100x100"
CLONE_DIR = "dataset/fruits-360-100x100"

def download_dataset():
    os.makedirs("dataset", exist_ok=True)
    subprocess.run(["git", "clone", GITHUB_REPO, CLONE_DIR], check=True)
    os.makedirs(ROOT_DIR, exist_ok=True)
    shutil.move(os.path.join(CLONE_DIR, "Training"), TRAIN_DIR)
    shutil.move(os.path.join(CLONE_DIR, "Test"), TEST_DIR)
    shutil.rmtree(CLONE_DIR, ignore_errors=True)

if not os.path.exists(ROOT_DIR):
    download_dataset()

assert os.path.exists(TRAIN_DIR), f"{TRAIN_DIR} not found"
assert os.path.exists(TEST_DIR), f"{TEST_DIR} not found"
print(f"Dataset ready: {ROOT_DIR}")

Dataset ready: dataset/fruit360


## 4. Dataset Class & Loaders

In [4]:
class Fruit360Dataset(Dataset):
    def __init__(self, root_dir, transform=None, variety=False):
        self.root_dir = root_dir
        self.transform = transform
        self.variety = variety
        self.samples = []

        for class_name in sorted(os.listdir(root_dir)):
            class_dir = os.path.join(root_dir, class_name)
            if not os.path.isdir(class_dir):
                continue

            label = class_name if variety else class_name.split()[0]

            for img_name in os.listdir(class_dir):
                if img_name.lower().endswith((".jpg", ".png")):
                    self.samples.append((os.path.join(class_dir, img_name), label))

        self.labels = sorted({lbl for _, lbl in self.samples})
        self.label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
        self.idx_to_label = {i: lbl for lbl, i in self.label_to_idx.items()}

        print(f"{os.path.basename(root_dir)}: {len(self.samples)} images, {len(self.labels)} classes")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label_str = self.samples[idx]
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        label_idx = self.label_to_idx[label_str]
        return img, label_idx


# Create datasets
transform = T.Compose([T.Resize((SIZE, SIZE)), T.ToTensor()])

train_full = Fruit360Dataset(TRAIN_DIR, transform=transform, variety=VARIETY)
test_dataset = Fruit360Dataset(TEST_DIR, transform=transform, variety=VARIETY)

# Split train into train/val
train_size = int(0.7 * len(train_full))
val_size = len(train_full) - train_size

train_dataset, val_dataset = random_split(
    train_full,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

# Create loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\nData split: Train {len(train_dataset)}, Val {len(val_dataset)}, Test {len(test_dataset)}")
print(f"Classes: {len(train_full.labels)}")

Training: 130344 images, 79 classes
Test: 43442 images, 79 classes

Data split: Train 91240, Val 39104, Test 43442
Classes: 79


## 5. Convert Loaders to NumPy

In [5]:
def extract_numpy(loader):
    """Extract images and labels from DataLoader to NumPy arrays."""
    X_list, y_list = [], []
    for imgs, labels in loader:
        X_list.append(imgs.numpy())
        y_list.append(labels.numpy())
    X = np.concatenate(X_list, axis=0)
    y = np.concatenate(y_list, axis=0)
    return X, y


X_train, y_train = extract_numpy(train_loader)
X_val, y_val = extract_numpy(val_loader)
X_test, y_test = extract_numpy(test_loader)

print(f"Extracted: X_train {X_train.shape}, X_val {X_val.shape}, X_test {X_test.shape}")

Extracted: X_train (91240, 3, 32, 32), X_val (39104, 3, 32, 32), X_test (43442, 3, 32, 32)


## 6. Augmentation Functions (Scenarios A/B/C)

In [6]:
def clamp_01(x):
    return torch.clamp(x, 0.0, 1.0)


def add_color_patches(x, num_patches, color, alpha_range=(0.4, 0.7), size_range=(0.05, 0.15)):
    _, H, W = x.shape
    out = x.clone()
    for _ in range(num_patches):
        s = np.random.uniform(size_range[0], size_range[1])
        patch_area = s * H * W / 4
        r = np.random.uniform(0.5, 1.5)
        patch_h = int(np.sqrt(patch_area / r))
        patch_w = int(np.sqrt(patch_area * r))
        patch_h = max(1, min(H, patch_h))
        patch_w = max(1, min(W, patch_w))
        top = np.random.randint(0, H - patch_h + 1)
        left = np.random.randint(0, W - patch_w + 1)
        bottom = top + patch_h
        right = left + patch_w
        alpha = np.random.uniform(alpha_range[0], alpha_range[1])
        patch = out[:, top:bottom, left:right]
        blended = alpha * color + (1 - alpha) * patch
        out[:, top:bottom, left:right] = blended
    return clamp_01(out)


def add_occlusion_patch(x, area_ratio=0.1, color=torch.tensor([0.5, 0.5, 0.5]).view(3, 1, 1), alpha=0.5):
    _, H, W = x.shape
    out = x.clone()
    patch_area = area_ratio * H * W
    r = np.random.uniform(0.5, 1.5)
    patch_h = int(np.sqrt(patch_area / r))
    patch_w = int(np.sqrt(patch_area * r))
    patch_h = max(1, min(H, patch_h))
    patch_w = max(1, min(W, patch_w))
    top = np.random.randint(0, H - patch_h + 1)
    left = np.random.randint(0, W - patch_w + 1)
    bottom = top + patch_h
    right = left + patch_w
    patch = out[:, top:bottom, left:right]
    blended = alpha * color + (1 - alpha) * patch
    out[:, top:bottom, left:right] = blended
    return clamp_01(out)


color_dirt = torch.tensor([0.3, 0.25, 0.2]).view(3, 1, 1)
color_bruise = torch.tensor([0.25, 0.2, 0.15]).view(3, 1, 1)


def noise_mild(x):
    return clamp_01(x + torch.randn_like(x) * 0.025)


def dark_mild(x):
    return clamp_01(x * 0.65)


def overexposed_mild(x):
    return clamp_01(x * 1.35)


def dirty_mild(x):
    return add_color_patches(x, num_patches=2, color=color_dirt, alpha_range=(0.5, 0.8), size_range=(0.03, 0.08))


def bruised_mild(x):
    return add_color_patches(x, num_patches=1, color=color_bruise, alpha_range=(0.4, 0.7), size_range=(0.03, 0.08))


def occlusion_small(x):
    return add_occlusion_patch(x, area_ratio=0.10, alpha=0.5)


blur_medium = T.GaussianBlur(kernel_size=5, sigma=1.0)


def scenario_A(x):
    x = blur_medium(x)
    x = noise_mild(x)
    if np.random.rand() < 0.7:
        x = dirty_mild(x)
    return x


def scenario_B(x):
    if np.random.rand() < 0.5:
        x = dark_mild(x)
    else:
        x = overexposed_mild(x)
    x = noise_mild(x)
    return x


def scenario_C(x):
    x = occlusion_small(x)
    if np.random.rand() < 0.5:
        x = bruised_mild(x)
    else:
        x = dirty_mild(x)
    return x


scenario_map = {
    "A": scenario_A,
    "B": scenario_B,
    "C": scenario_C,
}

print("Augmentation scenarios defined: A (blur+noise+dirty), B (light+noise), C (occlusion+bruise)")

Augmentation scenarios defined: A (blur+noise+dirty), B (light+noise), C (occlusion+bruise)


## 7. Apply Augmentation to Training Set

In [7]:
np.random.seed(RANDOM_STATE)

n_augment = int(len(X_train) * AUG_RATIO)
aug_indices = np.random.choice(len(X_train), n_augment, replace=False)

scenario_counts = {"A": 0, "B": 0, "C": 0}

for aug_idx in aug_indices:
    r = np.random.rand()
    if r < AUG_DIST["A"]:
        scenario_name = "A"
    elif r < AUG_DIST["A"] + AUG_DIST["B"]:
        scenario_name = "B"
    else:
        scenario_name = "C"
    scenario_counts[scenario_name] += 1
    img_tensor = torch.from_numpy(X_train[aug_idx]).float()
    aug_img = scenario_map[scenario_name](img_tensor)
    X_train[aug_idx] = aug_img.numpy()

print(f"\nTraining set augmented:")
print(f"  Total augmented: {n_augment} ({AUG_RATIO*100}%)")
print(f"  Scenario distribution: {scenario_counts}")


Training set augmented:
  Total augmented: 18248 (20.0%)
  Scenario distribution: {'A': 7293, 'B': 7358, 'C': 3597}


## 8. Feature Extraction Functions

In [8]:
def color_hist_features(X, bins=HIST_BINS, img_shape=(3, SIZE, SIZE)):
    """Extract HSV color histogram features."""
    n_samples = X.shape[0]
    feats = np.zeros((n_samples, 3 * bins), dtype=np.float32)
    bin_edges = np.linspace(0.0, 1.0, bins + 1)
    for i in range(n_samples):
        img = X[i].reshape(img_shape)
        img = np.transpose(img, (1, 2, 0))
        img = np.clip(img, 0.0, 1.0)
        img_hsv = (img * 255.0).astype(np.uint8)
        img_hsv = cv2.cvtColor(img_hsv, cv2.COLOR_RGB2HSV)
        h, s, v = cv2.split(img_hsv)
        hists = []
        for channel in (h, s, v):
            ch_norm = channel.astype(np.float32) / 255.0
            hist, _ = np.histogram(ch_norm.ravel(), bins=bin_edges, density=True)
            hists.append(hist)
        feats[i] = np.concatenate(hists)
    return feats


def _prepare_img(Xi, img_shape):
    img = Xi.reshape(img_shape)
    img = np.transpose(img, (1, 2, 0))
    return np.clip(img, 0.0, 1.0)


def _to_gray(img):
    return rgb2gray(img)


def hog_features(img_gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), orientations=9):
    """Extract HOG features."""
    return hog(
        img_gray,
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        block_norm="L2-Hys",
        transform_sqrt=True,
        feature_vector=True,
    )


def lbp_features(img_gray, P=8, R=1):
    """Extract LBP features."""
    img_u8 = np.clip(img_gray * 255.0, 0, 255).astype(np.uint8)
    lbp = local_binary_pattern(img_u8, P=P, R=R, method="uniform")
    n_bins = P + 2
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_bins + 1), density=True)
    return hist


def glcm_features(img_gray, distances=(1, 2), angles=(0, np.pi / 4, np.pi / 2, 3 * np.pi / 4)):
    """Extract GLCM features."""
    img_u8 = np.clip(img_gray * 255.0, 0, 255).astype(np.uint8)
    glcm = graycomatrix(
        img_u8,
        distances=distances,
        angles=angles,
        levels=256,
        symmetric=True,
        normed=True,
    )
    props = ["contrast", "dissimilarity", "homogeneity", "energy", "correlation", "ASM"]
    feats = [graycoprops(glcm, p).ravel() for p in props]
    return np.concatenate(feats)


def gabor_features(img_gray, frequencies=(0.1, 0.2, 0.3), thetas=(0, np.pi / 4, np.pi / 2, 3 * np.pi / 4)):
    """Extract Gabor features."""
    feats = []
    for freq in frequencies:
        for theta in thetas:
            real, imag = gabor(img_gray, frequency=freq, theta=theta)
            mag = np.sqrt(real ** 2 + imag ** 2)
            feats.append(mag.mean())
            feats.append(mag.var())
    return np.array(feats, dtype=np.float32)


def compute_feature_blocks(
    X,
    img_shape=(3, SIZE, SIZE),
    color_bins=HIST_BINS,
    hog_params=None,
    lbp_params=None,
    glcm_params=None,
    gabor_params=None,
    feature_keys=None,
):
    """Compute all requested feature blocks."""
    hog_params = hog_params or {}
    lbp_params = lbp_params or {}
    glcm_params = glcm_params or {}
    gabor_params = gabor_params or {}

    if feature_keys is None:
        feature_keys = {"color_hist", "hog", "lbp", "glcm", "gabor"}
    else:
        feature_keys = set(feature_keys)

    blocks = {}
    if "color_hist" in feature_keys:
        blocks["color_hist"] = color_hist_features(X, bins=color_bins, img_shape=img_shape)

    need_gray = any(k in feature_keys for k in ("hog", "lbp", "glcm", "gabor"))
    if need_gray:
        hog_list, lbp_list, glcm_list, gabor_list = [], [], [], []
        for i in range(X.shape[0]):
            img = _prepare_img(X[i], img_shape)
            gray = _to_gray(img)
            if "hog" in feature_keys:
                hog_list.append(hog_features(gray, **hog_params))
            if "lbp" in feature_keys:
                lbp_list.append(lbp_features(gray, **lbp_params))
            if "glcm" in feature_keys:
                glcm_list.append(glcm_features(gray, **glcm_params))
            if "gabor" in feature_keys:
                gabor_list.append(gabor_features(gray, **gabor_params))
        if "hog" in feature_keys:
            blocks["hog"] = np.vstack(hog_list).astype(np.float32)
        if "lbp" in feature_keys:
            blocks["lbp"] = np.vstack(lbp_list).astype(np.float32)
        if "glcm" in feature_keys:
            blocks["glcm"] = np.vstack(glcm_list).astype(np.float32)
        if "gabor" in feature_keys:
            blocks["gabor"] = np.vstack(gabor_list).astype(np.float32)
    return blocks


def concat_feature_blocks(blocks, keys):
    """Concatenate selected feature blocks."""
    return np.concatenate([blocks[k] for k in keys], axis=1)


print("Feature extraction functions defined.")

Feature extraction functions defined.


## 9. Degraded Test Set Evaluation Function

In [9]:
def evaluate_on_degraded_test(
    test_loader,
    scenario_fns,
    probs,
    feature_keys,
    hog_params,
    lbp_params,
    glcm_params,
    gabor_params,
    scaler,
    pca_model,
    clf,
    verbose=True,
):
    """Evaluate model on test set with on-the-fly degradation."""
    np.random.seed(RANDOM_STATE)
    torch.manual_seed(RANDOM_STATE)

    scenario_names = list(scenario_fns.keys())
    all_preds = []
    all_labels = []
    scenario_counts = {name: 0 for name in scenario_names}
    start = time.time()

    for imgs, labels in test_loader:
        imgs_batch = []
        for img in imgs:
            r = np.random.rand()
            # Assign scenario based on cumulative probability
            if r < probs[0]:
                scenario = scenario_names[0]  # clean
            elif r < probs[0] + probs[1]:
                scenario = scenario_names[1]  # A
            elif r < probs[0] + probs[1] + probs[2]:
                scenario = scenario_names[2]  # B
            else:
                scenario = scenario_names[3]  # C

            scenario_counts[scenario] += 1
            x = scenario_fns[scenario](img)
            imgs_batch.append(x.unsqueeze(0))

        imgs_batch = torch.cat(imgs_batch, dim=0)
        X = imgs_batch.numpy()

        # Extract features
        blocks = compute_feature_blocks(
            X,
            img_shape=(3, SIZE, SIZE),
            color_bins=HIST_BINS,
            hog_params=hog_params,
            lbp_params=lbp_params,
            glcm_params=glcm_params,
            gabor_params=gabor_params,
            feature_keys=feature_keys,
        )
        X_feat = concat_feature_blocks(blocks, feature_keys)
        X_sc = scaler.transform(X_feat)
        X_pca = pca_model.transform(X_sc)
        preds = clf.predict(X_pca)

        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    acc = (all_preds == all_labels).mean()
    elapsed = time.time() - start

    if verbose:
        print(f"  Test accuracy (degraded): {acc:.4f}")
        print(f"  Evaluation time: {elapsed:.2f}s")
        print(f"  Scenario breakdown: {scenario_counts}")

    return acc, elapsed, scenario_counts


print("Degraded test evaluation function defined.")

Degraded test evaluation function defined.


## 10. Feature Parameters Definition

In [10]:
# Define optimal parameters for each descriptor
HOG_PARAMS = {"pixels_per_cell": (8, 8), "cells_per_block": (2, 2), "orientations": 9}
LBP_PARAMS = {"P": 8, "R": 1}  # P=neighbors, R=radius
GLCM_PARAMS = {"distances": (1, 2), "angles": (0, np.pi / 4, np.pi / 2, 3 * np.pi / 4)}
GABOR_PARAMS = {"frequencies": (0.1, 0.2, 0.3), "thetas": (0, np.pi / 4, np.pi / 2, 3 * np.pi / 4)}

# Feature combinations
feature_combinations = {
    "Color Histogram": ["color_hist"],
    "CH + HOG": ["color_hist", "hog"],
    "CH + LBP": ["color_hist", "lbp"],
    "CH + GABOR": ["color_hist", "gabor"],
    "CH + GLCM": ["color_hist", "glcm"],
    "CH + HOG + LBP": ["color_hist", "hog", "lbp"],
}

# Scenario functions for on-the-fly test degradation
scenario_functions = {
    "clean": lambda x: x,
    "scenario_A": scenario_A,
    "scenario_B": scenario_B,
    "scenario_C": scenario_C,
}

print("Feature parameters defined:")
print(f"  HOG: {HOG_PARAMS}")
print(f"  LBP: {LBP_PARAMS}")
print(f"  GLCM: {GLCM_PARAMS}")
print(f"  GABOR: {GABOR_PARAMS}")
print(f"\nFeature combinations: {list(feature_combinations.keys())}")

Feature parameters defined:
  HOG: {'pixels_per_cell': (8, 8), 'cells_per_block': (2, 2), 'orientations': 9}
  LBP: {'P': 8, 'R': 1}
  GLCM: {'distances': (1, 2), 'angles': (0, 0.7853981633974483, 1.5707963267948966, 2.356194490192345)}
  GABOR: {'frequencies': (0.1, 0.2, 0.3), 'thetas': (0, 0.7853981633974483, 1.5707963267948966, 2.356194490192345)}

Feature combinations: ['Color Histogram', 'CH + HOG', 'CH + LBP', 'CH + GABOR', 'CH + GLCM', 'CH + HOG + LBP']


## 11. Pipeline: Feature Combination 1 - Color Histogram Only

In [11]:
print("\n" + "="*80)
print("FEATURE COMBINATION 1: Color Histogram (32 bins)")
print("="*80)

feature_keys = feature_combinations["Color Histogram"]
feature_name = "Color_Histogram"
start_total = time.time()

# Extract features
print("\nExtracting features...")
blocks_train = compute_feature_blocks(X_train, feature_keys=feature_keys, color_bins=HIST_BINS)
blocks_val = compute_feature_blocks(X_val, feature_keys=feature_keys, color_bins=HIST_BINS)
blocks_test = compute_feature_blocks(X_test, feature_keys=feature_keys, color_bins=HIST_BINS)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

# Scaling
print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

# Grid search on subset
print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_ch = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_ch = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_ch}")
print(f"Best CV score on subset: {best_score:.4f}")

# Fit final PCA with best parameters
print("\nFitting final PCA...")
pca_ch = PCA(n_components=best_params_ch["n_components"])
X_train_pca = pca_ch.fit_transform(X_train_sc)
X_val_pca = pca_ch.transform(X_val_sc)
X_test_pca = pca_ch.transform(X_test_sc)

# Train final SVM
print("\nTraining final SVM...")
clf_ch = SVC(kernel="rbf", C=best_params_ch["C"], gamma=best_params_ch["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_ch.fit(X_train_pca, y_train)

# Validation accuracy
val_acc_ch = clf_ch.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_ch:.4f}")

# Test accuracy on degraded set
print("\nEvaluating on degraded test set...")
test_acc_ch, test_time_ch, scenario_counts_ch = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_ch,
    clf_ch,
    verbose=True,
)

# Save model
print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_ch, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_ch, model_dir / "pca.joblib")
joblib.dump(best_params_ch, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

# Store results
elapsed_total = time.time() - start_total
result_ch = {
    "Feature Combination": "Color Histogram",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_ch["n_components"],
    "Variance Target": best_params_ch["variance_target"],
    "C": best_params_ch["C"],
    "Gamma": best_params_ch["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_ch:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_ch:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 1: Color Histogram (32 bins)

Extracting features...
  Train features: (91240, 96)
  Val features: (39104, 96)
  Test features: (43442, 96)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.01, 'n_components': 43}
Best CV score on subset: 0.9654

Fitting final PCA...

Training final SVM...
Validation accuracy: 0.9999

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9703
  Evaluation time: 163.06s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/Color_Histogram

Total execution time: 454.86s


## 12. Pipeline: Feature Combination 2 - CH + HOG

In [12]:
print("\n" + "="*80)
print("FEATURE COMBINATION 2: Color Histogram + HOG")
print("="*80)

feature_keys = feature_combinations["CH + HOG"]
feature_name = "CH_HOG"
start_total = time.time()

print("\nExtracting features...")
blocks_train = compute_feature_blocks(
    X_train,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_val = compute_feature_blocks(
    X_val,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_test = compute_feature_blocks(
    X_test,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_hog = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_hog = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_hog}")
print(f"Best CV score on subset: {best_score:.4f}")

print("\nFitting final PCA...")
pca_hog = PCA(n_components=best_params_hog["n_components"])
X_train_pca = pca_hog.fit_transform(X_train_sc)
X_val_pca = pca_hog.transform(X_val_sc)
X_test_pca = pca_hog.transform(X_test_sc)

print("\nTraining final SVM...")
clf_hog = SVC(kernel="rbf", C=best_params_hog["C"], gamma=best_params_hog["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_hog.fit(X_train_pca, y_train)

val_acc_hog = clf_hog.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_hog:.4f}")

print("\nEvaluating on degraded test set...")
test_acc_hog, test_time_hog, scenario_counts_hog = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_hog,
    clf_hog,
    verbose=True,
)

print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_hog, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_hog, model_dir / "pca.joblib")
joblib.dump(best_params_hog, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

elapsed_total = time.time() - start_total
result_hog = {
    "Feature Combination": "CH + HOG",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_hog["n_components"],
    "Variance Target": best_params_hog["variance_target"],
    "C": best_params_hog["C"],
    "Gamma": best_params_hog["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_hog:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_hog:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 2: Color Histogram + HOG

Extracting features...
  Train features: (91240, 420)
  Val features: (39104, 420)
  Test features: (43442, 420)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.001, 'n_components': 152}
Best CV score on subset: 0.9685

Fitting final PCA...

Training final SVM...
Validation accuracy: 0.9999

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9611
  Evaluation time: 361.71s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/CH_HOG

Total execution time: 1304.77s


## 13. Pipeline: Feature Combination 3 - CH + LBP

In [13]:
print("\n" + "="*80)
print("FEATURE COMBINATION 3: Color Histogram + LBP")
print("="*80)

feature_keys = feature_combinations["CH + LBP"]
feature_name = "CH_LBP"
start_total = time.time()

print("\nExtracting features...")
blocks_train = compute_feature_blocks(
    X_train,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_val = compute_feature_blocks(
    X_val,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_test = compute_feature_blocks(
    X_test,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_lbp = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_lbp = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_lbp}")
print(f"Best CV score on subset: {best_score:.4f}")

print("\nFitting final PCA...")
pca_lbp = PCA(n_components=best_params_lbp["n_components"])
X_train_pca = pca_lbp.fit_transform(X_train_sc)
X_val_pca = pca_lbp.transform(X_val_sc)
X_test_pca = pca_lbp.transform(X_test_sc)

print("\nTraining final SVM...")
clf_lbp = SVC(kernel="rbf", C=best_params_lbp["C"], gamma=best_params_lbp["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_lbp.fit(X_train_pca, y_train)

val_acc_lbp = clf_lbp.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_lbp:.4f}")

print("\nEvaluating on degraded test set...")
test_acc_lbp, test_time_lbp, scenario_counts_lbp = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_lbp,
    clf_lbp,
    verbose=True,
)

print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_lbp, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_lbp, model_dir / "pca.joblib")
joblib.dump(best_params_lbp, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

elapsed_total = time.time() - start_total
result_lbp = {
    "Feature Combination": "CH + LBP",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_lbp["n_components"],
    "Variance Target": best_params_lbp["variance_target"],
    "C": best_params_lbp["C"],
    "Gamma": best_params_lbp["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_lbp:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_lbp:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 3: Color Histogram + LBP

Extracting features...
  Train features: (91240, 106)
  Val features: (39104, 106)
  Test features: (43442, 106)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.01, 'n_components': 47}
Best CV score on subset: 0.9653

Fitting final PCA...

Training final SVM...
Validation accuracy: 0.9999

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9719
  Evaluation time: 186.88s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/CH_LBP

Total execution time: 522.88s


## 14. Pipeline: Feature Combination 4 - CH + GABOR

In [14]:
print("\n" + "="*80)
print("FEATURE COMBINATION 4: Color Histogram + GABOR")
print("="*80)

feature_keys = feature_combinations["CH + GABOR"]
feature_name = "CH_GABOR"
start_total = time.time()

print("\nExtracting features...")
blocks_train = compute_feature_blocks(
    X_train,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_val = compute_feature_blocks(
    X_val,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_test = compute_feature_blocks(
    X_test,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_gabor = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_gabor = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_gabor}")
print(f"Best CV score on subset: {best_score:.4f}")

print("\nFitting final PCA...")
pca_gabor = PCA(n_components=best_params_gabor["n_components"])
X_train_pca = pca_gabor.fit_transform(X_train_sc)
X_val_pca = pca_gabor.transform(X_val_sc)
X_test_pca = pca_gabor.transform(X_test_sc)

print("\nTraining final SVM...")
clf_gabor = SVC(kernel="rbf", C=best_params_gabor["C"], gamma=best_params_gabor["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_gabor.fit(X_train_pca, y_train)

val_acc_gabor = clf_gabor.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_gabor:.4f}")

print("\nEvaluating on degraded test set...")
test_acc_gabor, test_time_gabor, scenario_counts_gabor = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_gabor,
    clf_gabor,
    verbose=True,
)

print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_gabor, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_gabor, model_dir / "pca.joblib")
joblib.dump(best_params_gabor, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

elapsed_total = time.time() - start_total
result_gabor = {
    "Feature Combination": "CH + GABOR",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_gabor["n_components"],
    "Variance Target": best_params_gabor["variance_target"],
    "C": best_params_gabor["C"],
    "Gamma": best_params_gabor["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_gabor:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_gabor:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 4: Color Histogram + GABOR

Extracting features...
  Train features: (91240, 120)
  Val features: (39104, 120)
  Test features: (43442, 120)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.01, 'n_components': 50}
Best CV score on subset: 0.9708

Fitting final PCA...

Training final SVM...
Validation accuracy: 1.0000

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9795
  Evaluation time: 2783.37s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/CH_GABOR

Total execution time: 12824.14s


## 15. Pipeline: Feature Combination 5 - CH + GLCM

In [15]:
print("\n" + "="*80)
print("FEATURE COMBINATION 5: Color Histogram + GLCM")
print("="*80)

feature_keys = feature_combinations["CH + GLCM"]
feature_name = "CH_GLCM"
start_total = time.time()

print("\nExtracting features...")
blocks_train = compute_feature_blocks(
    X_train,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_val = compute_feature_blocks(
    X_val,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_test = compute_feature_blocks(
    X_test,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_glcm = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_glcm = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_glcm}")
print(f"Best CV score on subset: {best_score:.4f}")

print("\nFitting final PCA...")
pca_glcm = PCA(n_components=best_params_glcm["n_components"])
X_train_pca = pca_glcm.fit_transform(X_train_sc)
X_val_pca = pca_glcm.transform(X_val_sc)
X_test_pca = pca_glcm.transform(X_test_sc)

print("\nTraining final SVM...")
clf_glcm = SVC(kernel="rbf", C=best_params_glcm["C"], gamma=best_params_glcm["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_glcm.fit(X_train_pca, y_train)

val_acc_glcm = clf_glcm.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_glcm:.4f}")

print("\nEvaluating on degraded test set...")
test_acc_glcm, test_time_glcm, scenario_counts_glcm = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_glcm,
    clf_glcm,
    verbose=True,
)

print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_glcm, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_glcm, model_dir / "pca.joblib")
joblib.dump(best_params_glcm, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

elapsed_total = time.time() - start_total
result_glcm = {
    "Feature Combination": "CH + GLCM",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_glcm["n_components"],
    "Variance Target": best_params_glcm["variance_target"],
    "C": best_params_glcm["C"],
    "Gamma": best_params_glcm["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_glcm:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_glcm:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 5: Color Histogram + GLCM

Extracting features...
  Train features: (91240, 144)
  Val features: (39104, 144)
  Test features: (43442, 144)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.01, 'n_components': 43}
Best CV score on subset: 0.9640

Fitting final PCA...

Training final SVM...
Validation accuracy: 1.0000

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9742
  Evaluation time: 1269.00s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/CH_GLCM

Total execution time: 5721.52s


## 16. Pipeline: Feature Combination 6 - CH + HOG + LBP

In [16]:
print("\n" + "="*80)
print("FEATURE COMBINATION 6: Color Histogram + HOG + LBP")
print("="*80)

feature_keys = feature_combinations["CH + HOG + LBP"]
feature_name = "CH_HOG_LBP"
start_total = time.time()

print("\nExtracting features...")
blocks_train = compute_feature_blocks(
    X_train,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_val = compute_feature_blocks(
    X_val,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)
blocks_test = compute_feature_blocks(
    X_test,
    feature_keys=feature_keys,
    color_bins=HIST_BINS,
    hog_params=HOG_PARAMS,
    lbp_params=LBP_PARAMS,
    glcm_params=GLCM_PARAMS,
    gabor_params=GABOR_PARAMS,
)

X_train_feat = concat_feature_blocks(blocks_train, feature_keys)
X_val_feat = concat_feature_blocks(blocks_val, feature_keys)
X_test_feat = concat_feature_blocks(blocks_test, feature_keys)

print(f"  Train features: {X_train_feat.shape}")
print(f"  Val features: {X_val_feat.shape}")
print(f"  Test features: {X_test_feat.shape}")

print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_feat)
X_val_sc = scaler.transform(X_val_feat)
X_test_sc = scaler.transform(X_test_feat)

print("\nRunning grid search with cross-validation...")
tuning_subset_size = int(len(X_train_sc) * TUNING_SUBSET_RATIO)
X_train_subset = X_train_sc[:tuning_subset_size]
y_train_subset = y_train[:tuning_subset_size]

best_score = -np.inf
best_params_hog_lbp = {}

for target_var in VARIANCE_TARGETS:
    pca_temp = PCA()
    pca_temp.fit(X_train_subset)
    cumsum = np.cumsum(pca_temp.explained_variance_ratio_)
    n_comp = int(np.argmax(cumsum >= target_var) + 1)
    n_comp = max(1, min(n_comp, len(cumsum)))

    X_train_subset_pca = pca_temp.transform(X_train_subset)[:, :n_comp]

    for C in C_VALUES:
        for gamma in GAMMA_VALUES:
            clf = SVC(kernel="rbf", C=C, gamma=gamma, random_state=RANDOM_STATE, verbose=0)
            scores = cross_val_score(clf, X_train_subset_pca, y_train_subset, cv=CV_FOLDS, scoring="accuracy")
            mean_score = scores.mean()

            if mean_score > best_score:
                best_score = mean_score
                best_params_hog_lbp = {"variance_target": target_var, "C": C, "gamma": gamma, "n_components": n_comp}

print(f"\nBest parameters: {best_params_hog_lbp}")
print(f"Best CV score on subset: {best_score:.4f}")

print("\nFitting final PCA...")
pca_hog_lbp = PCA(n_components=best_params_hog_lbp["n_components"])
X_train_pca = pca_hog_lbp.fit_transform(X_train_sc)
X_val_pca = pca_hog_lbp.transform(X_val_sc)
X_test_pca = pca_hog_lbp.transform(X_test_sc)

print("\nTraining final SVM...")
clf_hog_lbp = SVC(kernel="rbf", C=best_params_hog_lbp["C"], gamma=best_params_hog_lbp["gamma"], random_state=RANDOM_STATE, verbose=0)
clf_hog_lbp.fit(X_train_pca, y_train)

val_acc_hog_lbp = clf_hog_lbp.score(X_val_pca, y_val)
print(f"Validation accuracy: {val_acc_hog_lbp:.4f}")

print("\nEvaluating on degraded test set...")
test_acc_hog_lbp, test_time_hog_lbp, scenario_counts_hog_lbp = evaluate_on_degraded_test(
    test_loader,
    scenario_functions,
    TEST_DEGRADATION_DISTRIBUTION,
    feature_keys,
    HOG_PARAMS,
    LBP_PARAMS,
    GLCM_PARAMS,
    GABOR_PARAMS,
    scaler,
    pca_hog_lbp,
    clf_hog_lbp,
    verbose=True,
)

print("\nSaving model...")
model_dir = MODEL_SAVE_DIR / feature_name
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf_hog_lbp, model_dir / "model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
joblib.dump(pca_hog_lbp, model_dir / "pca.joblib")
joblib.dump(best_params_hog_lbp, model_dir / "params.joblib")
print(f"Model saved to: {model_dir}")

elapsed_total = time.time() - start_total
result_hog_lbp = {
    "Feature Combination": "CH + HOG + LBP",
    "Feature Dimension": X_train_feat.shape[1],
    "PCA Components": best_params_hog_lbp["n_components"],
    "Variance Target": best_params_hog_lbp["variance_target"],
    "C": best_params_hog_lbp["C"],
    "Gamma": best_params_hog_lbp["gamma"],
    "CV Score": f"{best_score:.4f}",
    "Val Accuracy": f"{val_acc_hog_lbp:.4f}",
    "Test Accuracy (Degraded)": f"{test_acc_hog_lbp:.4f}",
    "Total Time (s)": f"{elapsed_total:.2f}",
}

print(f"\nTotal execution time: {elapsed_total:.2f}s")


FEATURE COMBINATION 6: Color Histogram + HOG + LBP

Extracting features...
  Train features: (91240, 430)
  Val features: (39104, 430)
  Test features: (43442, 430)

Applying StandardScaler...

Running grid search with cross-validation...

Best parameters: {'variance_target': 0.95, 'C': 100, 'gamma': 0.001, 'n_components': 155}
Best CV score on subset: 0.9683

Fitting final PCA...

Training final SVM...
Validation accuracy: 0.9999

Evaluating on degraded test set...
  Test accuracy (degraded): 0.9621
  Evaluation time: 474.57s
  Scenario breakdown: {'clean': 26070, 'scenario_A': 6650, 'scenario_B': 6421, 'scenario_C': 4301}

Saving model...
Model saved to: saved_models/hcf_degraded_comparison/CH_HOG_LBP

Total execution time: 1616.60s


## 17. Final Comparison Table

In [18]:
print("\n" + "="*100)
print("FINAL RESULTS COMPARISON")
print("="*100)

# Compile all results
all_results = [
    result_ch,
    result_hog,
    result_lbp,
    result_gabor,
    result_glcm,
    result_hog_lbp,
]

# Create DataFrame
results_df = pd.DataFrame(all_results)

# Sort by Test Accuracy (descending)
results_df["Test Accuracy (Degraded)"] = results_df["Test Accuracy (Degraded)"].astype(float)
results_df_sorted = results_df.sort_values("Test Accuracy (Degraded)", ascending=False).reset_index(drop=True)

print("\n" + results_df_sorted.to_string(index=True))

# Summary statistics
print("\n" + "-"*100)
print("SUMMARY STATISTICS")
print("-"*100)

best_idx = results_df_sorted["Test Accuracy (Degraded)"].idxmax()
best_result = results_df_sorted.loc[best_idx]

print(f"\nBest performing: {best_result['Feature Combination']}")
print(f"  Test Accuracy: {best_result['Test Accuracy (Degraded)']}")
print(f"  Validation Accuracy: {best_result['Val Accuracy']}")
print(f"  PCA Components: {best_result['PCA Components']}")
print(f"  SVM C={best_result['C']}, gamma={best_result['Gamma']}")

print("\nRanking by Test Accuracy (Degraded Test Set):")
for idx, (_, row) in enumerate(results_df_sorted.iterrows(), 1):
    print(f"  {idx}. {row['Feature Combination']:20s} - Test: {row['Test Accuracy (Degraded)']:>6.4f}, Val: {row['Val Accuracy']:>6s}")

print(f"\nAverage Test Accuracy: {results_df_sorted['Test Accuracy (Degraded)'].astype(float).mean():.4f}")
print(f"Average Validation Accuracy: {results_df_sorted['Val Accuracy'].astype(float).mean():.4f}")
print(f"Total Execution Time: {results_df['Total Time (s)'].astype(float).sum():.2f}s")

print("\n" + "="*100)
print(f"All models saved to: {MODEL_SAVE_DIR}")
print("="*100)


FINAL RESULTS COMPARISON

  Feature Combination  Feature Dimension  PCA Components  Variance Target    C  Gamma CV Score Val Accuracy  Test Accuracy (Degraded) Total Time (s)
0          CH + GABOR                120              50             0.95  100  0.010   0.9708       1.0000                    0.9795       12824.14
1           CH + GLCM                144              43             0.95  100  0.010   0.9640       1.0000                    0.9742        5721.52
2            CH + LBP                106              47             0.95  100  0.010   0.9653       0.9999                    0.9719         522.88
3     Color Histogram                 96              43             0.95  100  0.010   0.9654       0.9999                    0.9703         454.86
4      CH + HOG + LBP                430             155             0.95  100  0.001   0.9683       0.9999                    0.9621        1616.60
5            CH + HOG                420             152             0.95  100 

## 18. Export Results to CSV

In [19]:
# Save detailed results to CSV
csv_path = Path("results/unified_comparison_results.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)

results_df_sorted.to_csv(csv_path, index=True)
print(f"Results saved to: {csv_path}")

# Also save the sorted version with better formatting
results_df_sorted.to_csv(csv_path.with_stem("unified_comparison_results_sorted"), index=True)
print(f"Sorted results saved to: {csv_path.with_stem('unified_comparison_results_sorted')}")

Results saved to: results/unified_comparison_results.csv
Sorted results saved to: results/unified_comparison_results_sorted.csv
