In [None]:
import os
import shutil
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import joblib

# Single seed reused for NumPy, PyTorch and the scikit-learn estimators below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # seeds NumPy's legacy global RNG
torch.manual_seed(RANDOM_STATE)  # seeds PyTorch's global RNG


<torch._C.Generator at 0x1761055d0>

In [2]:
# On-disk layout the rest of the notebook reads from.
ROOT_DIR = "dataset/fruit360"
TRAIN_DIR = os.path.join(ROOT_DIR, "Training")
TEST_DIR = os.path.join(ROOT_DIR, "Test")

# Source repository and the scratch directory it is cloned into
# (the scratch directory is removed once the splits have been moved out).
GITHUB_REPO = "https://github.com/fruits-360/fruits-360-100x100"
CLONE_DIR = "dataset/fruits-360-100x100"

def download_dataset():
    """Clone the dataset repository and move its splits under ``ROOT_DIR``.

    The repository at ``GITHUB_REPO`` is cloned into ``CLONE_DIR``; its
    ``Training`` and ``Test`` directories are then moved to ``TRAIN_DIR`` and
    ``TEST_DIR``. A split directory that already exists is left untouched.
    ``CLONE_DIR`` is removed afterwards, whether or not the download succeeded.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with a non-zero
            status.
    """
    os.makedirs("dataset", exist_ok=True)

    # A clone left behind by an interrupted run would make `git clone` refuse
    # to write into the non-empty destination, so start from a clean slate.
    shutil.rmtree(CLONE_DIR, ignore_errors=True)
    try:
        # Only the current working tree is needed, not the repository history.
        subprocess.run(
            ["git", "clone", "--depth", "1", GITHUB_REPO, CLONE_DIR],
            check=True,
        )

        os.makedirs(ROOT_DIR, exist_ok=True)
        for split_name, split_dir in (("Training", TRAIN_DIR), ("Test", TEST_DIR)):
            if os.path.isdir(split_dir):
                # shutil.move would nest the source *inside* an existing
                # directory; keep the split that is already in place.
                continue
            shutil.move(os.path.join(CLONE_DIR, split_name), split_dir)
    finally:
        # Never leave a partial clone behind, even when a step above failed.
        shutil.rmtree(CLONE_DIR, ignore_errors=True)

# Check for the two split directories rather than ROOT_DIR alone: ROOT_DIR can
# exist without its contents (e.g. after an interrupted download), in which
# case the download has to be retried.
if not (os.path.isdir(TRAIN_DIR) and os.path.isdir(TEST_DIR)):
    download_dataset()

assert os.path.exists(TRAIN_DIR), f"{TRAIN_DIR} not found"
assert os.path.exists(TEST_DIR), f"{TEST_DIR} not found"
print(f"Dataset ready: {ROOT_DIR}")


Dataset ready: dataset/fruit360


In [3]:
class Fruit360FolderDataset(Dataset):
    """Image-folder dataset where each sub-directory of ``root_dir`` is a class.

    Args:
        root_dir: Directory whose immediate sub-directories hold the images.
        transform: Optional callable applied to every loaded RGB PIL image.
        variety: If True, the full sub-directory name is the label. Otherwise
            only its first whitespace-separated token is used, so directories
            sharing a first word are merged into one class.

    Attributes:
        samples: List of ``(image_path, label_string)`` tuples, ordered by
            class directory name and then by file name.
        label_to_idx: Mapping from label string to integer index (labels are
            indexed in sorted order).
        idx_to_label: Inverse of ``label_to_idx``.
    """

    def __init__(self, root_dir, transform=None, variety=False):
        self.root_dir = root_dir
        self.transform = transform
        self.variety = variety
        self.samples = []

        for class_name in sorted(os.listdir(root_dir)):
            class_dir = os.path.join(root_dir, class_name)
            if not os.path.isdir(class_dir):
                continue

            label = class_name if self.variety else class_name.split()[0]

            # os.listdir returns entries in arbitrary order. Sorting keeps the
            # sample order, and therefore any seeded index-based split of this
            # dataset, identical across machines and filesystems.
            for img_name in sorted(os.listdir(class_dir)):
                if img_name.lower().endswith((".jpg", ".png")):
                    self.samples.append((os.path.join(class_dir, img_name), label))

        unique_labels = sorted({lbl for _, lbl in self.samples})
        self.label_to_idx = {lbl: i for i, lbl in enumerate(unique_labels)}
        self.idx_to_label = {i: lbl for lbl, i in self.label_to_idx.items()}

        print(f"{os.path.basename(root_dir)}: {len(self.samples)} images, {len(unique_labels)} classes")

    def __len__(self):
        """Return the number of image samples found under ``root_dir``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for the sample at position ``idx``.

        The image is loaded as RGB and passed through ``transform`` when one
        was supplied.
        """
        img_path, label_str = self.samples[idx]
        # The context manager releases the underlying file handle right away;
        # convert() returns a separate image that stays usable afterwards.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # Compare against None explicitly: a callable's truthiness is not a
        # reliable "was a transform supplied" signal.
        if self.transform is not None:
            img = self.transform(img)
        return img, self.label_to_idx[label_str]


In [None]:
def extract_numpy(loader):
    """Drain ``loader`` and return its batches stacked as NumPy arrays.

    Args:
        loader: Iterable yielding ``(images, labels)`` pairs whose elements
            expose a ``.numpy()`` method.

    Returns:
        Tuple ``(X, y)``: all image batches and all label batches, each
        concatenated along axis 0 in iteration order.
    """
    batches = [(imgs.numpy(), labels.numpy()) for imgs, labels in loader]
    X = np.concatenate([image_batch for image_batch, _ in batches], axis=0)
    y = np.concatenate([label_batch for _, label_batch in batches], axis=0)
    return X, y


In [5]:
# Directory that receives the fitted scaler / PCA / SVM files written by joblib.
MODEL_DIR = "saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)  # no error if the directory already exists
print(f"Models will be saved in: {MODEL_DIR}")

Models will be saved in: saved_models


In [6]:
# One entry per model to train, keyed by the name used in the saved file names.
#   size         - images are resized to size x size before flattening
#   n_components - number of principal components kept by PCA
#   C, gamma     - hyper-parameters passed to the RBF-kernel SVC
# NOTE(review): the values are presumably the result of an earlier
# hyper-parameter search - confirm and link it here.
MODELS_CONFIG = {
    "8x8": {
        "size": 8,
        "n_components": 55,
        "C": 100,
        "gamma": 0.01,
    },
    "16x16": {
        "size": 16,
        "n_components": 158,
        "C": 100,
        "gamma": 0.001,
    },
    "32x32": {
        "size": 32,
        "n_components": 66,
        "C": 10,
        "gamma": 0.001,
    },
}

# Echo the configuration so the run log records what was trained.
print("Model configurations loaded:")
for name, cfg in MODELS_CONFIG.items():
    print(f"  {name}: SIZE={cfg['size']}, PC={cfg['n_components']}, C={cfg['C']}, gamma={cfg['gamma']}")


Model configurations loaded:
  8x8: SIZE=8, PC=55, C=100, gamma=0.01
  16x16: SIZE=16, PC=158, C=100, gamma=0.001
  32x32: SIZE=32, PC=66, C=10, gamma=0.001


In [7]:
VARIETY = False   # False -> label is the first word of the class directory name
BATCH_SIZE = 100  # only affects how images are streamed into NumPy arrays

# For every configuration: load the images at the configured resolution,
# fit scaler -> PCA -> RBF SVM, report test accuracy and persist the three
# fitted objects to MODEL_DIR.
for model_name, config in MODELS_CONFIG.items():
    print("\n" + "="*70)
    print(f"TRAINING AND SAVING: {model_name}")
    print("="*70)

    SIZE = config["size"]
    N_COMP = config["n_components"]
    C_VAL = config["C"]
    GAMMA_VAL = config["gamma"]

    transform = T.Compose([
        T.Resize((SIZE, SIZE)),
        T.ToTensor(),
    ])

    print(f"\nLoading dataset at {SIZE}x{SIZE}...")
    train_full = Fruit360FolderDataset(TRAIN_DIR, transform=transform, variety=VARIETY)
    test_dataset = Fruit360FolderDataset(TEST_DIR, transform=transform, variety=VARIETY)

    # Each dataset builds its own label -> index mapping from the directories
    # it finds. If the two splits disagree, the same integer would denote
    # different classes and the test accuracy would be meaningless.
    if train_full.label_to_idx != test_dataset.label_to_idx:
        raise ValueError(
            "Training and Test splits define different label sets; "
            "their integer labels are not comparable."
        )

    train_size = int(0.7 * len(train_full))
    val_size = len(train_full) - train_size
    train_dataset, val_dataset = random_split(
        train_full,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(RANDOM_STATE)
    )

    print(f"Train {len(train_dataset)}, Val {len(val_dataset)}, Test {len(test_dataset)}")

    # The loaders only stream images into arrays, so shuffling adds nothing.
    # Keeping shuffle off makes the row order independent of the global torch
    # RNG state, so re-running this cell reproduces the same arrays.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    print("Extracting numpy arrays...")
    X_train, y_train = extract_numpy(train_loader)
    X_val, y_val = extract_numpy(val_loader)
    X_test, y_test = extract_numpy(test_loader)

    # Flatten every image to a single feature vector per sample.
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_val_flat = X_val.reshape(X_val.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)

    print(f"Flattened shape: {X_train_flat.shape}")

    # NOTE(review): the scaler is fitted on the 70% train split only, whereas
    # PCA and the SVM below are fitted on train+val. Confirm this is intended.
    print("Fitting StandardScaler...")
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train_flat)
    X_val_sc = scaler.transform(X_val_flat)
    X_test_sc = scaler.transform(X_test_flat)

    print(f"Fitting PCA with {N_COMP} components...")
    pca = PCA(n_components=N_COMP, random_state=RANDOM_STATE)

    # The final model is trained on train and validation data combined.
    X_train_val_sc = np.concatenate([X_train_sc, X_val_sc], axis=0)
    y_train_val = np.concatenate([y_train, y_val], axis=0)

    X_train_val_pca = pca.fit_transform(X_train_val_sc)
    X_test_pca = pca.transform(X_test_sc)

    print(f"Train+Val PCA shape: {X_train_val_pca.shape}")
    print(f"Test PCA shape: {X_test_pca.shape}")

    print(f"Training SVM (C={C_VAL}, gamma={GAMMA_VAL})...")
    svm = SVC(C=C_VAL, gamma=GAMMA_VAL, kernel='rbf', random_state=RANDOM_STATE)
    svm.fit(X_train_val_pca, y_train_val)

    y_test_pred = svm.predict(X_test_pca)
    test_acc = accuracy_score(y_test, y_test_pred)
    print(f"✓ Final test accuracy: {test_acc:.4f}")

    # Persist the three fitted stages separately, one file per stage.
    scaler_path = os.path.join(MODEL_DIR, f"scaler_{model_name}.joblib")
    pca_path = os.path.join(MODEL_DIR, f"pca_{model_name}.joblib")
    svm_path = os.path.join(MODEL_DIR, f"svm_{model_name}.joblib")

    joblib.dump(scaler, scaler_path)
    joblib.dump(pca, pca_path)
    joblib.dump(svm, svm_path)

    print("✓ Models saved:")
    print(f"  - {scaler_path}")
    print(f"  - {pca_path}")
    print(f"  - {svm_path}")

print("\n" + "="*70)
print("ALL MODELS TRAINED AND SAVED SUCCESSFULLY!")
print("="*70)



TRAINING AND SAVING: 8x8

Loading dataset at 8x8...
Training: 130344 images, 79 classes
Test: 43442 images, 79 classes
Train 91240, Val 39104, Test 43442
Extracting numpy arrays...
Flattened shape: (91240, 192)
Fitting StandardScaler...
Fitting PCA with 55 components...
Train+Val PCA shape: (130344, 55)
Test PCA shape: (43442, 55)
Training SVM (C=100, gamma=0.01)...
✓ Final test accuracy: 0.9794
✓ Models saved:
  - saved_models/scaler_8x8.joblib
  - saved_models/pca_8x8.joblib
  - saved_models/svm_8x8.joblib

TRAINING AND SAVING: 16x16

Loading dataset at 16x16...
Training: 130344 images, 79 classes
Test: 43442 images, 79 classes
Train 91240, Val 39104, Test 43442
Extracting numpy arrays...
Flattened shape: (91240, 768)
Fitting StandardScaler...
Fitting PCA with 158 components...
Train+Val PCA shape: (130344, 158)
Test PCA shape: (43442, 158)
Training SVM (C=100, gamma=0.001)...
✓ Final test accuracy: 0.9813
✓ Models saved:
  - saved_models/scaler_16x16.joblib
  - saved_models/pca_16x

In [8]:
# Smoke test: every artefact written by the training cell can be read back.
print("Testing model loading...")

for model_name in MODELS_CONFIG:
    # Resolve the three file locations first, then deserialize each one.
    scaler_path = os.path.join(MODEL_DIR, f"scaler_{model_name}.joblib")
    pca_path = os.path.join(MODEL_DIR, f"pca_{model_name}.joblib")
    svm_path = os.path.join(MODEL_DIR, f"svm_{model_name}.joblib")

    scaler = joblib.load(scaler_path)
    pca = joblib.load(pca_path)
    svm = joblib.load(svm_path)

    print(f"✓ {model_name}: scaler, pca ({pca.n_components_} PC), svm loaded successfully")

print("\nAll models loaded correctly. Ready for robustness testing!")


Testing model loading...
✓ 8x8: scaler, pca (55 PC), svm loaded successfully
✓ 16x16: scaler, pca (158 PC), svm loaded successfully
✓ 32x32: scaler, pca (66 PC), svm loaded successfully

All models loaded correctly. Ready for robustness testing!
