In [6]:
import os
import pickle
import numpy as np
from PIL import Image
from tqdm import tqdm
import random

In [7]:
# Root folder scanned by process_dataset(); each immediate sub-directory is
# treated as one class and its name is used as the label.
DATASET_PATH = "../data"
# Size handed to PIL's Image.resize for every image (Pillow expects (width, height)).
IMAGE_SIZE = (128, 128)
# Upper bound on images kept per class; larger classes are randomly subsampled.
MAX_IMAGES_PER_CLASS = 1500
# Destination of the pickled dataset written by save_dataset().
OUTPUT_PATH = "dataset.pkl"
# Lower-case file extensions (with leading dot) accepted by get_image_files().
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}

In [8]:
def load_and_resize_image(image_path, target_size):
    """Load one image, force it to RGB, resize it and return it as an array.

    Args:
        image_path: Path of the image file to open with PIL.
        target_size: Size tuple forwarded unchanged to ``Image.resize``.

    Returns:
        A NumPy array built from the resized RGB image, or ``None`` when
        anything goes wrong while opening, converting or resizing (the error
        is printed and swallowed so one bad file does not abort a whole run).
    """
    try:
        with Image.open(image_path) as source:
            # Only convert when needed; images already in RGB are used as-is.
            rgb = source if source.mode == 'RGB' else source.convert('RGB')
            return np.array(rgb.resize(target_size, Image.Resampling.LANCZOS))
    except Exception as e:
        print(f"Error: {image_path}: {e}")
        return None

def get_image_files(folder_path, extensions=None):
    """Return the sorted paths of the image files directly inside ``folder_path``.

    The listing is sorted because ``os.listdir`` returns entries in arbitrary,
    filesystem-dependent order. Callers that subsample the result with a
    seeded ``random.sample`` only get reproducible selections if the input
    order is stable.

    Args:
        folder_path: Directory to scan (not recursive).
        extensions: Optional collection of lower-case extensions including the
            leading dot (e.g. ``{'.jpg', '.png'}``). Defaults to the
            module-level ``SUPPORTED_EXTENSIONS``.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, in sorted name
        order, for every regular file whose extension (compared
        case-insensitively) is in ``extensions``.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    if extensions is None:
        extensions = SUPPORTED_EXTENSIONS

    image_files = []
    for file in sorted(os.listdir(folder_path)):
        if os.path.splitext(file.lower())[1] not in extensions:
            continue
        full_path = os.path.join(folder_path, file)
        # Skip directories that merely look like images (e.g. "thumbs.jpg/").
        if os.path.isfile(full_path):
            image_files.append(full_path)
    return image_files

def process_dataset():
    """Build the image/label arrays from the class folders under ``DATASET_PATH``.

    Every immediate sub-directory of ``DATASET_PATH`` is treated as a class.
    For each class, at most ``MAX_IMAGES_PER_CLASS`` image files are kept
    (randomly subsampled with the ``random`` module when there are more), each
    is loaded and resized to ``IMAGE_SIZE``, and unreadable files are skipped.

    Returns:
        A tuple ``(X, y, class_folders)`` where ``X`` is ``np.array`` of the
        loaded image arrays, ``y`` is ``np.array`` of the matching class names
        (one per image, same order as ``X``), and ``class_folders`` is the
        sorted list of class directory names.
    """
    class_folders = sorted(
        item for item in os.listdir(DATASET_PATH)
        if os.path.isdir(os.path.join(DATASET_PATH, item))
    )
    print(f"\nClasses found: {class_folders}")
    X = []
    y = []

    for class_name in class_folders:
        print(f"\nProcessing: {class_name}")
        class_path = os.path.join(DATASET_PATH, class_name)
        image_files = get_image_files(class_path)
        if len(image_files) > MAX_IMAGES_PER_CLASS:
            image_files = random.sample(image_files, MAX_IMAGES_PER_CLASS)

        # Count successes while loading instead of decoding every image a
        # second time just to report how many were added.
        added = 0
        for img_path in tqdm(image_files, desc=f"Processing {class_name}"):
            img_array = load_and_resize_image(img_path, IMAGE_SIZE)
            if img_array is not None:
                X.append(img_array)
                y.append(class_name)
                added += 1
        print(f"  {added} images added !")

    X = np.array(X)
    y = np.array(y)

    print(f"\nResult:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"Classes: {len(class_folders)}")
    print(f"Example y: {y[:5]}")

    return X, y, class_folders

In [9]:
def save_dataset(X, y, class_names, output_path):
    """Pickle the dataset to ``output_path`` and print the resulting file size.

    Args:
        X: Image data to store under the ``'X'`` key.
        y: Labels to store under the ``'y'`` key.
        class_names: Class names to store under the ``'class_names'`` key.
        output_path: Destination file; opened in binary write mode.

    Returns:
        None. The file is written as a side effect and a one-line summary
        with the size in MB is printed.
    """
    payload = dict(X=X, y=y, class_names=class_names)

    with open(output_path, 'wb') as handle:
        pickle.dump(payload, handle)

    bytes_per_mb = 1024**2
    size_mb = os.path.getsize(output_path) / bytes_per_mb
    print(f"\nDataset saved: {output_path} ({size_mb:.2f} MB)")

In [10]:
# Driver cell: build the dataset from DATASET_PATH and persist it to OUTPUT_PATH.
if __name__ == "__main__":
    # Seed Python's RNG so the random.sample subsampling in process_dataset()
    # uses a fixed seed.
    random.seed(42)
    # NumPy's global RNG is seeded as well; no NumPy randomness is used in the
    # visible cells, so this appears to be precautionary.
    np.random.seed(42)

    print("🚀 Début du traitement...")
    # Load, resize and label every image, then pickle the arrays plus class names.
    X, y, class_names = process_dataset()
    save_dataset(X, y, class_names, OUTPUT_PATH)
    print(f"\n✅ Terminé!")
    print(f"📁 Dataset: {OUTPUT_PATH}")

🚀 Début du traitement...

Classes found: ['butterfly', 'cat', 'chicken', 'cow', 'dog', 'elephant', 'horse', 'sheep', 'spider', 'squirrel']

Processing: butterfly


Processing butterfly:   0%|          | 0/1500 [00:00<?, ?it/s]

Processing butterfly: 100%|██████████| 1500/1500 [00:05<00:00, 292.94it/s]


  1500 images added !

Processing: cat


Processing cat: 100%|██████████| 1500/1500 [00:10<00:00, 137.39it/s]


  1500 images added !

Processing: chicken


Processing chicken: 100%|██████████| 1500/1500 [00:02<00:00, 581.85it/s]


  1500 images added !

Processing: cow


Processing cow: 100%|██████████| 1500/1500 [00:02<00:00, 645.86it/s]


  1500 images added !

Processing: dog


Processing dog: 100%|██████████| 1500/1500 [00:02<00:00, 600.48it/s]


  1500 images added !

Processing: elephant


Processing elephant: 100%|██████████| 1446/1446 [00:04<00:00, 332.22it/s]


  1446 images added !

Processing: horse


Processing horse: 100%|██████████| 1500/1500 [00:02<00:00, 559.82it/s]


  1500 images added !

Processing: sheep


Processing sheep: 100%|██████████| 1500/1500 [00:05<00:00, 289.41it/s]


  1500 images added !

Processing: spider


Processing spider: 100%|██████████| 1500/1500 [00:02<00:00, 565.12it/s]


  1500 images added !

Processing: squirrel


Processing squirrel: 100%|██████████| 1500/1500 [00:02<00:00, 629.10it/s]


  1500 images added !

Result:
X shape: (14946, 128, 128, 3)
y shape: (14946,)
Classes: 10
Example y: ['butterfly' 'butterfly' 'butterfly' 'butterfly' 'butterfly']

Dataset saved: dataset.pkl (701.11 MB)

✅ Terminé!
📁 Dataset: dataset.pkl
