# Fruits-360 Pipeline (Color Histogram Baseline)
Baseline notebook using color histogram features and classic ML models.

**Key knobs:** `size`, `batch`, `RANDOM_STATE`, `color_bins`, `k_list`, `C_list`, `rf_depth_list`.

In [1]:
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
import torchvision.transforms as T
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from utils.pipeline_utils import (
    download_dataset,
    FruitFolderDataset,
    dataloader_to_numpy,
    save_checkpoint,
    color_hist_features,
 )
# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [2]:
# Dataset layout expected by the rest of the notebook: ROOT_DIR/Training and ROOT_DIR/Test.
ROOT_DIR = "dataset/fruit360"
TRAIN_DIR = os.path.join(ROOT_DIR, "Training")
TEST_DIR = os.path.join(ROOT_DIR, "Test")

# Upstream repository and local clone location handed to download_dataset.
GITHUB_REPO = "https://github.com/fruits-360/fruits-360-100x100"
CLONE_DIR = "dataset/fruits-360-100x100"

# Only fetch when the dataset root is missing, so re-running this cell is cheap.
if not os.path.exists(ROOT_DIR):
    download_dataset(
        root_dir=ROOT_DIR,
        train_dir=TRAIN_DIR,
        test_dir=TEST_DIR,
        github_repo=GITHUB_REPO,
        clone_dir=CLONE_DIR,
    )

# Validate explicitly instead of using `assert`: assertions are stripped under
# `python -O` and carry no message. `isdir` also rejects a stray file that
# happens to be named like a split folder.
for _split_name, _split_dir in (("Training", TRAIN_DIR), ("Test", TEST_DIR)):
    if not os.path.isdir(_split_dir):
        raise FileNotFoundError(
            f"{_split_name} split not found at {_split_dir!r}. "
            f"If {ROOT_DIR!r} exists but is incomplete, remove it so the "
            "download step runs again."
        )

print(f"Train dir: {TRAIN_DIR}")
print(f"Test dir: {TEST_DIR}")

Train dir: dataset/fruit360/Training
Test dir: dataset/fruit360/Test


In [3]:
# Presumably the default image side length in pixels.
# NOTE(review): not referenced by the visible cells (the sweep uses `sizes`) — confirm it is still needed.
size = 32
# Batch size for the DataLoaders that feed dataloader_to_numpy.
batch = 128
# Seed for the train/validation split and for the NumPy / torch global RNGs.
RANDOM_STATE = 42

## Experiment: HSV histogram + SVM vs image size
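For each image size (8x8, 16x16, 32x32), trains an HSV histogram + RBF SVM on a 70/30 train/validation split of the training set, picks `C` by validation accuracy, and reports accuracy on the clean test set.
Also saves a checkpoint of the best model for each size.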

In [4]:
# Image side lengths (pixels) to sweep; each size gets its own full train/evaluate run.
sizes = [8, 16, 32]
# Passed as `bins` to color_hist_features.
color_bins = 16
# Candidate values for the SVM `C` parameter; the best one is picked on the validation split.
C_list = [10, 20, 40]

def run_hist_svm_for_size(img_size):
    """Train, select and evaluate the colour-histogram + RBF-SVM baseline at one image size.

    Pipeline:
      1. Load the Training and Test folders, resizing every image to
         ``img_size x img_size`` and converting it to a tensor.
      2. Split Training 70/30 into train/validation with a generator seeded by
         ``RANDOM_STATE``.
      3. Convert each split to NumPy, extract colour-histogram features
         (``color_bins`` bins) and standardise them with a scaler fitted on the
         training features only.
      4. Fit one ``SVC(kernel="rbf")`` per value in ``C_list`` and keep the one
         with the highest validation accuracy (the first wins on ties).
      5. Score the selected model on the test set and save a checkpoint.

    Reads the module-level knobs ``batch``, ``color_bins``, ``C_list``,
    ``RANDOM_STATE``, ``TRAIN_DIR`` and ``TEST_DIR``.

    Args:
        img_size: Side length in pixels that images are resized to.

    Returns:
        dict with keys ``img_size``, ``val_acc``, ``test_acc``, ``best_C``,
        ``ckpt_path`` (whatever ``save_checkpoint`` returns), ``model``,
        ``scaler``, ``X_test_np`` and ``y_test_np``.

    Raises:
        ValueError: If ``C_list`` is empty, so no model could be selected.
    """
    candidate_Cs = list(C_list)
    if not candidate_Cs:
        # Fail before the expensive data loading rather than with an
        # AttributeError on `None.predict` at the very end.
        raise ValueError("C_list must contain at least one candidate value for C.")

    img_shape = (3, img_size, img_size)
    transform = T.Compose([
        T.Resize((img_size, img_size)),
        T.ToTensor(),
    ])

    full_train_dataset = FruitFolderDataset(TRAIN_DIR, transform=transform, variety=False)
    test_dataset = FruitFolderDataset(TEST_DIR, transform=transform, variety=False)

    # 70/30 train/validation split; the dedicated generator keeps the split
    # identical across runs regardless of the global torch RNG state.
    train_size = int(0.7 * len(full_train_dataset))
    val_size = len(full_train_dataset) - train_size

    train_dataset, val_dataset = random_split(
        full_train_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(RANDOM_STATE),
    )

    # NOTE(review): shuffle=True only permutes the row order of the training
    # arrays (assuming dataloader_to_numpy keeps X and y aligned); kept as-is
    # so previously reported numbers stay reproducible.
    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=False)

    X_train_np, y_train_np = dataloader_to_numpy(train_loader)
    X_val_np, y_val_np = dataloader_to_numpy(val_loader)
    X_test_np, y_test_np = dataloader_to_numpy(test_loader)

    X_train_color = color_hist_features(X_train_np, bins=color_bins, img_shape=img_shape)
    X_val_color = color_hist_features(X_val_np, bins=color_bins, img_shape=img_shape)
    X_test_color = color_hist_features(X_test_np, bins=color_bins, img_shape=img_shape)

    # Fit the scaler on training features only so no validation/test
    # statistics leak into the model.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train_color)
    X_val_std = scaler.transform(X_val_color)
    X_test_std = scaler.transform(X_test_color)

    # Start below any achievable accuracy so the first candidate is always
    # accepted, even if its validation accuracy is exactly 0.0.
    best_svm_acc = float("-inf")
    best_C = None
    best_svm_model = None
    for C in candidate_Cs:
        svm = SVC(kernel="rbf", C=C)
        svm.fit(X_train_std, y_train_np)
        y_val_pred = svm.predict(X_val_std)
        acc = accuracy_score(y_val_np, y_val_pred)
        if acc > best_svm_acc:
            best_svm_acc = acc
            best_C = C
            best_svm_model = svm

    # The test set is touched once, with the model selected on validation.
    y_test_pred = best_svm_model.predict(X_test_std)
    test_acc = accuracy_score(y_test_np, y_test_pred)

    meta_base = {
        "task": "fruit360",
        "split": "All",
        "feature": "color_hist",
        "img_size": img_size,
        "bins": color_bins,
        "seed": RANDOM_STATE,
        "n_classes": len(full_train_dataset.label_to_idx),
        "labels": full_train_dataset.labels,
        "label_to_idx": full_train_dataset.label_to_idx,
    }

    ckpt_path = save_checkpoint(
        best_svm_model,
        scaler,
        {
            **meta_base,
            "model": "svm",
            "params": {"kernel": "rbf", "C": best_C},
        },
        save_meta=False,
    )

    return {
        "img_size": img_size,
        "val_acc": best_svm_acc,
        "test_acc": test_acc,
        "best_C": best_C,
        "ckpt_path": ckpt_path,
        "model": best_svm_model,
        "scaler": scaler,
        "X_test_np": X_test_np,
        "y_test_np": y_test_np,
    }

# Sweep results in run order, plus the same result dicts keyed by image size
# so later cells can look up a specific size.
results_by_size = []
models_by_size = {}

for img_size in sizes:
    # Re-seed the global RNGs before every run so each size starts from the
    # same random state and the sweep is reproducible.
    np.random.seed(RANDOM_STATE)
    torch.manual_seed(RANDOM_STATE)
    out = run_hist_svm_for_size(img_size)
    results_by_size.append(out)
    models_by_size[img_size] = out
    print(
        f"Size {img_size}x{img_size} -> Val: {out['val_acc']:.4f}, "
        f"Test: {out['test_acc']:.4f}, C={out['best_C']}"
    )

# Summary of where each size's checkpoint was written.
print("\nCheckpoint paths:")
for out in results_by_size:
    print(f"  {out['img_size']}x{out['img_size']}: {out['ckpt_path']}")


Training -> 130344 images, 79 classes
Test -> 43442 images, 79 classes
Size 8x8 -> Val: 0.9984, Test: 0.9650, C=40
Training -> 130344 images, 79 classes
Test -> 43442 images, 79 classes
Size 16x16 -> Val: 0.9999, Test: 0.9792, C=40
Training -> 130344 images, 79 classes
Test -> 43442 images, 79 classes
Size 32x32 -> Val: 1.0000, Test: 0.9837, C=40

Checkpoint paths:
  8x8: artifacts/checkpoints/fruit360/color_hist/svm/20260217-184546_6751bc48
  16x16: artifacts/checkpoints/fruit360/color_hist/svm/20260217-185416_29ca2320
  32x32: artifacts/checkpoints/fruit360/color_hist/svm/20260217-190103_0dbeb873


## 32x32 SVM: test clean vs 20% augmented test
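Applies scenarios A/B/C to a random 20% of the test set (each selected image gets A, B, or C with probability 0.4, 0.4, 0.2) and evaluates the 32x32 SVM on the full test set, i.e. 80% clean + 20% augmented images.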

In [5]:
from utils.pipeline_utils import scenarioA, scenarioB, scenarioC

# Fraction of test samples that receive an augmentation.
AUG_RATIO_TEST = 0.20
# Probability of each scenario for a sample that was selected for augmentation.
AUG_DIST_TEST = {"A": 0.4, "B": 0.4, "C": 0.2}

# Scenario name -> augmentation callable imported from utils.pipeline_utils.
scenario_map = {
    "A": scenarioA,
    "B": scenarioB,
    "C": scenarioC,
}

def apply_test_augmentation(X, aug_ratio, aug_dist, scenarios=None):
    """Return a copy of ``X`` in which a random subset of samples is augmented.

    ``int(len(X) * aug_ratio)`` distinct samples are chosen with
    ``np.random.choice``. Each chosen sample draws one uniform number from
    ``np.random.rand`` and is assigned a scenario by walking the cumulative
    probabilities of ``aug_dist`` in sorted-name order; the last name absorbs
    any remaining probability mass. For the notebook's A/B/C mapping this
    gives the same thresholds, and therefore the same random draws and
    results, as the previous hard-coded if/elif chain.

    Args:
        X: NumPy array of samples indexed along axis 0. It is not modified.
        aug_ratio: Fraction of samples to augment.
        aug_dist: Mapping of scenario name -> probability of that scenario.
        scenarios: Optional mapping of scenario name -> callable. Each callable
            receives one sample as a float torch tensor and must return a
            tensor whose ``.numpy()`` can be written back into that sample's
            slot. Defaults to the module-level ``scenario_map``.

    Returns:
        ``(X_aug, counts)``: the augmented copy, and a dict with the number of
        samples each scenario was applied to.

    Raises:
        ValueError: If ``aug_ratio`` selects fewer than zero or more than
            ``len(X)`` samples, or if samples must be augmented but
            ``aug_dist`` names no scenario.
        KeyError: If a drawn scenario name is missing from ``scenarios``.
    """
    if scenarios is None:
        scenarios = scenario_map

    X_aug = X.copy()
    n_aug = int(len(X_aug) * aug_ratio)
    if not 0 <= n_aug <= len(X_aug):
        raise ValueError(
            f"aug_ratio={aug_ratio!r} selects {n_aug} of {len(X_aug)} samples; "
            "it must select between 0 and len(X) samples."
        )

    # Sorted order keeps the name -> interval assignment independent of the
    # dict's insertion order.
    names = sorted(aug_dist)
    if n_aug > 0 and not names:
        raise ValueError("aug_dist must name at least one scenario.")

    # Upper bounds of every scenario's probability interval except the last,
    # which acts as the catch-all.
    upper_bounds = []
    running_total = 0.0
    for name in names[:-1]:
        running_total += aug_dist[name]
        upper_bounds.append(running_total)

    aug_indices = np.random.choice(len(X_aug), n_aug, replace=False)

    counts = {name: 0 for name in names}
    for idx in aug_indices:
        # Draw inside the loop (not vectorised): a scenario callable may use
        # the global NumPy RNG itself, so the order of draws must be kept.
        r = np.random.rand()
        scenario_name = names[-1]
        for name, upper in zip(names, upper_bounds):
            if r < upper:
                scenario_name = name
                break

        counts[scenario_name] += 1
        img_tensor = torch.from_numpy(X_aug[idx]).float()
        aug_img = scenarios[scenario_name](img_tensor)
        X_aug[idx] = aug_img.numpy()

    return X_aug, counts

# This cell reuses the model, scaler and raw test arrays produced by the
# size sweep, so fail early with a clear message if that cell was not run.
if 32 not in models_by_size:
    raise RuntimeError("Run the size sweep above before this cell.")

# Re-seed so the augmented subset and scenario draws are reproducible.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

model_32 = models_by_size[32]["model"]
scaler_32 = models_by_size[32]["scaler"]
X_test_32 = models_by_size[32]["X_test_np"]
y_test_32 = models_by_size[32]["y_test_np"]

# Augment a random AUG_RATIO_TEST fraction of the test samples; the rest stay clean.
X_test_32_aug, aug_counts = apply_test_augmentation(
    X_test_32,
    aug_ratio=AUG_RATIO_TEST,
    aug_dist=AUG_DIST_TEST,
)

# Same feature extraction as in training, applied to both variants.
X_test_32_color = color_hist_features(X_test_32, bins=color_bins, img_shape=(3, 32, 32))
X_test_32_aug_color = color_hist_features(X_test_32_aug, bins=color_bins, img_shape=(3, 32, 32))

# Reuse the scaler fitted on the training features; it is not refitted here.
X_test_32_std = scaler_32.transform(X_test_32_color)
X_test_32_aug_std = scaler_32.transform(X_test_32_aug_color)

y_test_pred_clean = model_32.predict(X_test_32_std)
y_test_pred_aug = model_32.predict(X_test_32_aug_std)

# Both accuracies are over the full test set; the second set contains the
# augmented subset mixed with the untouched samples.
acc_test_clean = accuracy_score(y_test_32, y_test_pred_clean)
acc_test_aug = accuracy_score(y_test_32, y_test_pred_aug)

print(f"32x32 test accuracy (clean): {acc_test_clean:.4f}")
print(f"32x32 test accuracy (20% augmented): {acc_test_aug:.4f}")
print(f"Augmentation distribution: {aug_counts}")


32x32 test accuracy (clean): 0.9837
32x32 test accuracy (20% augmented): 0.8852
Augmentation distribution: {'A': 3456, 'B': 3485, 'C': 1747}
