In [None]:
# fichier: data_prep_lfw.py
import os
import random
from PIL import Image
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_lfw_people
import matplotlib.pyplot as plt

IMAGE_SIZE = (160, 160)  # ou (224, 224)

def preprocess_image(image_array, image_size=IMAGE_SIZE):
    """Redimensionne et convertit une image numpy (RGB)"""
    img = Image.fromarray(image_array.astype(np.uint8))
    img = img.resize(image_size)
    arr = np.asarray(img).astype(np.uint8)
    return arr

def make_pairs(images, labels, n_same_per_person=5, n_diff_pairs=5000, seed=42):
    """Crée des paires (même personne / personnes différentes) à partir des images et labels"""
    random.seed(seed)
    same_pairs = []
    diff_pairs = []

    # Grouper les images par personne
    people = {}
    for idx, person in enumerate(labels):
        people.setdefault(person, []).append(images[idx])

    # same-person pairs
    for person, imgs in people.items():
        if len(imgs) < 2:
            continue
        combos = []
        for i in range(len(imgs)):
            for j in range(i+1, len(imgs)):
                combos.append((imgs[i], imgs[j]))
        random.shuffle(combos)
        for pair in combos[:n_same_per_person]:
            same_pairs.append((pair[0], pair[1], 1))

    # diff-person pairs
    people_list = list(people.keys())
    while len(diff_pairs) < n_diff_pairs:
        p1, p2 = random.sample(people_list, 2)
        a = random.choice(people[p1])
        b = random.choice(people[p2])
        diff_pairs.append((a, b, 0))

    pairs = same_pairs + diff_pairs
    random.shuffle(pairs)
    return pairs

if __name__ == '__main__':
    # Charger le dataset depuis sklearn
    lfw = fetch_lfw_people(color=True, resize=1.0, min_faces_per_person=2, download_if_missing=True)
    print("Shape des images:", lfw.images.shape)
    print("Nombre total d'images:", len(lfw.images))
    print("Nombre total de personnes:", len(lfw.target_names))
    print("Exemples de noms:", lfw.target_names[:10])

    # Prétraitement
    print("Prétraitement des images...")
    images = np.array([preprocess_image(img) for img in lfw.images])
    labels = lfw.target  # indices des personnes

    # Génération des paires
    pairs = make_pairs(images, labels, n_same_per_person=5, n_diff_pairs=5000)
    print(f"{len(pairs)} paires générées au total")


Shape des images: (9164, 125, 94, 3)
Nombre total d'images: 9164
Nombre total de personnes: 1680
Exemples de noms: ['Aaron Peirsol' 'Aaron Sorkin' 'Abdel Nasser Assidi' 'Abdoulaye Wade'
 'Abdullah' 'Abdullah Gul' 'Abdullah al-Attiyah' 'Abdullatif Sener'
 'Abel Pacheco' 'Abid Hamid Mahmud Al-Tikriti']
Prétraitement des images...
9702 paires générées au total


In [6]:
lfw.target 

array([ 127,  353,  392, ...,  905,  160, 1329], shape=(9164,))

In [16]:
import numpy as np

# Compter le nombre d'occurrences de l'index 127
count = np.sum(lfw.target == 163)

print(f"Nombre d'images pour la personne avec l'index 127 : {count}")


Nombre d'images pour la personne avec l'index 127 : 9


In [17]:
lfw.data

array([[0.5019608 , 0.37254903, 0.29803923, ..., 0.3764706 , 0.41960785,
        0.5058824 ],
       [0.2       , 0.1254902 , 0.13333334, ..., 0.21960784, 0.25490198,
        0.28235295],
       [0.18431373, 0.15686275, 0.09411765, ..., 0.41960785, 0.17254902,
        0.13725491],
       ...,
       [0.28627452, 0.2       , 0.10588235, ..., 0.54509807, 0.6       ,
        0.6       ],
       [0.8980392 , 0.57254905, 0.45490196, ..., 0.7529412 , 0.7490196 ,
        0.7411765 ],
       [0.46666667, 0.28235295, 0.21176471, ..., 0.6431373 , 0.7490196 ,
        0.8156863 ]], shape=(9164, 35250), dtype=float32)