In [1]:
import hashlib
import os
import random
import shutil
from pathlib import Path

# Configuration
data_dir = Path("hg14-handgesture14-dataset/versions/1/HG14/HG14-Hand Gesture")
output_dir = Path("Dataset")
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1
random_seed = 42
random.seed(random_seed)

# The test split receives whatever remains after train and val are sliced off,
# so the three ratios must describe a complete partition of each class.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "Les ratios train/val/test doivent sommer à 1"

# Create the top-level split directories.
for split in ["train", "val", "test"]:
    (output_dir / split).mkdir(parents=True, exist_ok=True)

# Walk the class directories. iterdir() yields entries in arbitrary order, so
# sort them: otherwise the order in which the seeded RNG is consumed (and thus
# the resulting split) depends on the filesystem rather than on the seed alone.
for class_dir in sorted(data_dir.iterdir()):
    if not class_dir.is_dir():
        continue

    class_name = class_dir.name
    print(f"Traitement de la classe: {class_name}")

    # Collect regular files only (a sub-directory whose name contains a dot
    # would also match "*.*"), sort them for a stable starting order, then
    # shuffle with the seeded RNG.
    images = sorted(p for p in class_dir.glob("*.*") if p.is_file())
    random.shuffle(images)

    # Slice boundaries: train | val | test (test takes the remainder).
    total_images = len(images)
    train_end = int(total_images * train_ratio)
    val_end = train_end + int(total_images * val_ratio)

    train_images = images[:train_end]
    val_images = images[train_end:val_end]
    test_images = images[val_end:]

    # Sanity check: no source file is assigned to more than one split.
    assert len(set(train_images) & set(val_images)) == 0
    assert len(set(train_images) & set(test_images)) == 0
    assert len(set(val_images) & set(test_images)) == 0

    # Copy each split into <output_dir>/<split>/<class>/, renaming every file
    # to "<class>_<split>_<index><suffix>".
    for split, split_images in (
        ("train", train_images),
        ("val", val_images),
        ("test", test_images),
    ):
        split_class_dir = output_dir / split / class_name
        split_class_dir.mkdir(exist_ok=True)
        for idx, img in enumerate(split_images):
            new_name = f"{class_name}_{split}_{idx:04d}{img.suffix}"
            shutil.copy(img, split_class_dir / new_name)

# Final verification
def count_files(path):
    """Return the number of files found anywhere under ``path`` (recursive)."""
    total = 0
    for _root, _dirs, filenames in os.walk(path):
        total += len(filenames)
    return total

# Report how many files ended up in each split versus the source dataset.
train_count = count_files(output_dir / "train")
val_count = count_files(output_dir / "val")
test_count = count_files(output_dir / "test")

print("\nVérification finale:")
print(f"Total images train: {train_count}")
print(f"Total images val: {val_count}")
print(f"Total images test: {test_count}")
print(f"Total combiné: {train_count + val_count + test_count}")
print(f"Total original: {count_files(data_dir)}")

# Overlap check (optional).
# Paths under train/, val/ and test/ can never compare equal to each other, and
# the copies are renamed per split, so intersecting the path sets always gives
# zero. Compare SHA-256 digests of the file contents instead, which detects the
# same image landing in two splits.
all_train = set(output_dir.glob("train/**/*"))
all_val = set(output_dir.glob("val/**/*"))
all_test = set(output_dir.glob("test/**/*"))

# The "**/*" pattern also yields the class directories; only hash real files.
train_digests = {hashlib.sha256(p.read_bytes()).hexdigest() for p in all_train if p.is_file()}
val_digests = {hashlib.sha256(p.read_bytes()).hexdigest() for p in all_val if p.is_file()}
test_digests = {hashlib.sha256(p.read_bytes()).hexdigest() for p in all_test if p.is_file()}

overlap = (
    len(train_digests & val_digests)
    + len(train_digests & test_digests)
    + len(val_digests & test_digests)
)
print(f"\nOverlap détecté: {overlap} fichiers")


Traitement de la classe: Gesture_0
Traitement de la classe: Gesture_1
Traitement de la classe: Gesture_10
Traitement de la classe: Gesture_11
Traitement de la classe: Gesture_12
Traitement de la classe: Gesture_13
Traitement de la classe: Gesture_2
Traitement de la classe: Gesture_3
Traitement de la classe: Gesture_4
Traitement de la classe: Gesture_5
Traitement de la classe: Gesture_6
Traitement de la classe: Gesture_7
Traitement de la classe: Gesture_8
Traitement de la classe: Gesture_9

Vérification finale:
Total images train: 9800
Total images val: 2800
Total images test: 1400
Total combiné: 14000
Total original: 14000

Overlap détecté: 0 fichiers


In [4]:
def split_dataset(
    source_dir: str,
    dest_dir: str,
    train_ratio: float = 0.8,
    seed: int = 42,
    img_extensions: tuple = (".jpg", ".jpeg", ".png", ".bmp")
):
    """
    Split a class-per-folder image dataset into train/val copies without overlap.

    Each immediate sub-directory of ``source_dir`` is treated as one class. Its
    images are sorted, shuffled with the seeded RNG, and copied (keeping their
    original file names) into ``dest_dir/train/<class>`` and
    ``dest_dir/val/<class>``. Existing content of ``dest_dir`` is not removed,
    so files left over from an earlier run are included in the printed totals
    and in the overlap check.

    Args:
        source_dir: Path to the original dataset (one sub-directory per class).
        dest_dir: Parent destination directory; ``train`` and ``val`` are
            created inside it.
        train_ratio: Fraction of each class assigned to training (default: 0.8).
        seed: Random seed for reproducibility (default: 42). Note that this
            reseeds the global ``random`` module.
        img_extensions: File suffixes (compared case-insensitively) treated as
            images.

    Raises:
        ValueError: If a class has so few images that either split would be empty.
        RuntimeError: If the same ``<class>/<file name>`` is present in both
            the train and the val directory after copying.
    """

    # Initialisation
    random.seed(seed)
    source_path = Path(source_dir)
    dest_path = Path(dest_dir)
    # Lower-case the allowed suffixes so the comparison below is
    # case-insensitive on both sides.
    extensions = {ext.lower() for ext in img_extensions}

    # Create the split roots.
    train_root = dest_path / "train"
    val_root = dest_path / "val"
    train_root.mkdir(parents=True, exist_ok=True)
    val_root.mkdir(parents=True, exist_ok=True)

    # Walk the classes in sorted order: iterdir() yields entries in arbitrary
    # order, and the order in which the RNG is consumed must be stable for the
    # seed to give a reproducible split.
    for class_dir in sorted(source_path.iterdir()):
        if not class_dir.is_dir():
            continue

        class_name = class_dir.name
        print(f"Traitement de la classe: {class_name}")

        # Keep regular image files only, in a stable order before shuffling.
        images = sorted(
            f for f in class_dir.iterdir()
            if f.is_file() and f.suffix.lower() in extensions
        )
        random.shuffle(images)

        # Compute the split point; both sides must be non-empty.
        split_idx = int(len(images) * train_ratio)
        if split_idx == 0 or split_idx == len(images):
            raise ValueError(f"Classe {class_name} a trop peu d'images: {len(images)}")

        # Create the per-class sub-directories.
        train_class_dir = train_root / class_name
        val_class_dir = val_root / class_name
        train_class_dir.mkdir(exist_ok=True)
        val_class_dir.mkdir(exist_ok=True)

        # Copy the files, keeping their original names.
        for img in images[:split_idx]:
            shutil.copy(img, train_class_dir / img.name)

        for img in images[split_idx:]:
            shutil.copy(img, val_class_dir / img.name)

    # Final validation
    print("\nValidation du split:")
    total_source = sum(1 for _ in source_path.glob("*/*"))

    # Identify every copied file by "<class>/<file name>". Comparing bare file
    # names would report a false overlap whenever two different classes reuse
    # the same file name.
    train_files = {f.relative_to(train_root) for f in train_root.glob("*/*")}
    val_files = {f.relative_to(val_root) for f in val_root.glob("*/*")}
    total_train = len(train_files)
    total_val = len(val_files)

    print(f"Total original: {total_source}")
    print(f"Total train: {total_train}")
    print(f"Total val: {total_val}")
    # Skip the ratio when nothing was copied to avoid a division by zero.
    if total_train + total_val:
        print(f"Rapport train/val: {total_train/(total_train+total_val):.2%}")

    # Overlap check
    common = train_files & val_files
    if common:
        raise RuntimeError(f"Overlap détecté! {len(common)} fichiers en commun")
    else:
        print("✓ Aucun fichier en commun entre train et val")


In [5]:
# Run the 80/20 train/val split on the HG14 dataset.
# NOTE(review): dest_dir is "Dataset", the same directory the 70/20/10 split
# cell uses as output_dir. split_dataset copies into it without removing
# anything, so files left by an earlier split would still be present and would
# be included in the printed totals — confirm the directory is cleaned between runs.
split_dataset(
    source_dir="hg14-handgesture14-dataset/versions/1/HG14/HG14-Hand Gesture",
    dest_dir="Dataset",
    train_ratio=0.8,
    seed=42
)

Traitement de la classe: Gesture_0
Traitement de la classe: Gesture_1
Traitement de la classe: Gesture_10
Traitement de la classe: Gesture_11
Traitement de la classe: Gesture_12
Traitement de la classe: Gesture_13
Traitement de la classe: Gesture_2
Traitement de la classe: Gesture_3
Traitement de la classe: Gesture_4
Traitement de la classe: Gesture_5
Traitement de la classe: Gesture_6
Traitement de la classe: Gesture_7
Traitement de la classe: Gesture_8
Traitement de la classe: Gesture_9

Validation du split:
Total original: 14000
Total train: 11200
Total val: 2800
Rapport train/val: 80.00%
✓ Aucun fichier en commun entre train et val


In [3]:
# Example overlap check.
# Paths under train/ and val/ can never be equal to each other, so intersecting
# the path sets always yields zero. Compare SHA-256 digests of the file
# contents instead, which detects the same image being present in both splits
# even when the copies were renamed.
train_files = set(output_dir.glob("train/**/*.*"))
val_files = set(output_dir.glob("val/**/*.*"))
train_digests = {hashlib.sha256(p.read_bytes()).hexdigest() for p in train_files if p.is_file()}
val_digests = {hashlib.sha256(p.read_bytes()).hexdigest() for p in val_files if p.is_file()}
print(f"Images en commun : {len(train_digests & val_digests)}")

Images en commun : 0


In [None]:
import kagglehub

# Download latest version
# NOTE(review): the other cells read the dataset from a hard-coded relative
# path ("hg14-handgesture14-dataset/versions/1/...") rather than from `path`;
# presumably this download has to run first and its result must be available
# at that location — confirm, and consider moving this cell to the top.
path = kagglehub.dataset_download("gulerosman/hg14-handgesture14-dataset")

print("Path to dataset files:", path)