## Check the number of images in each folder

In [1]:
import os
import numpy as np
from tqdm import tqdm
from PIL import Image
import pillow_heif
import sys

# Make the project root importable so that `src.*` modules resolve.
# The root is one level up from the working directory ("notebooks/").
project_root = os.path.abspath("..")

# Guard the insertion so re-running this cell does not pile up duplicate entries
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.process_data import *

In [4]:
# Root of the raw dataset: each sub-folder holds the images of one class
directory_path = os.path.join(os.getcwd(), '../', 'data', 'raw')

# Keep only sub-directories (stray files are not image folders) and sort them,
# because os.listdir returns entries in arbitrary order.
folders = sorted(
    entry for entry in os.listdir(directory_path)
    if os.path.isdir(os.path.join(directory_path, entry))
)

print(f"Length of folders: {len(folders)}")

Length of folders: 27


In [3]:
# Threshold used to flag folders: [+] reaches it, [-] falls short
MIN_IMAGES = 200

for folder in folders:
    folder_path = os.path.join(directory_path, folder)

    # Skip stray files: os.listdir raises on anything that is not a directory
    if not os.path.isdir(folder_path):
        continue

    images = os.listdir(folder_path)
    marker = "+" if len(images) >= MIN_IMAGES else "-"
    print(f"[{marker}] Folder: {folder} - {len(images)} images")

[-] Folder: Armando García - 195 images
[-] Folder: Armando Islas - 103 images
[+] Folder: Carlos Aguilar - 200 images
[+] Folder: Cinthya Sánchez - 202 images
[+] Folder: Daniela Flores - 201 images
[-] Folder: Diego Rodriguez - 119 images
[+] Folder: Ernesto Rosales - 203 images
[+] Folder: Evelyn Escudero - 201 images
[+] Folder: Fernando Carmona - 202 images
[-] Folder: Galo Ayala - 175 images
[+] Folder: Gerardo Martínez - 200 images
[-] Folder: Isaac Saenz - 191 images
[+] Folder: Ismael Arista - 227 images
[+] Folder: JESSICA JUAREZ - 200 images
[+] Folder: Jesus Soria - 200 images
[+] Folder: Jorge Orozco - 200 images
[-] Folder: José Piña - 181 images
[-] Folder: Marlene Vazquez - 196 images
[+] Folder: Mauricio Cortes - 202 images
[+] Folder: Natalia Anaya - 201 images
[-] Folder: Oscar Espinosa Berrueco - 199 images
[-] Folder: Rafael Díaz - 72 images
[+] Folder: Romario Reyes - 200 images
[+] Folder: Santiago Barranco - 200 images
[-] Folder: Sergio Gutierrez - 197 images
[

## Normalize folder names

In [None]:
# Get the names of the class folders in the raw dataset directory
directory_path = os.path.join(os.getcwd(), '../', 'data', 'raw')

# Keep only sub-directories and sort them: os.listdir also returns stray files
# and gives no ordering guarantee.
folders = sorted(
    entry for entry in os.listdir(directory_path)
    if os.path.isdir(os.path.join(directory_path, entry))
)

In [11]:
# Target name for every folder, as produced by the project helper
# (presumably a plain string usable as a directory name — confirm in src.data.process_data)
normalize_folders = [normalize_folder_name(folder) for folder in folders]

for folder, new_folder in zip(folders, normalize_folders):
    old_path = os.path.join(directory_path, folder)
    new_path = os.path.join(directory_path, new_folder)

    # Only existing directories are renamed. This skips stray files and makes the
    # cell safe to re-run: names that were already renamed no longer exist on disk.
    if not os.path.isdir(old_path):
        continue

    # Nothing to do when the name is already normalized
    if new_folder == folder:
        continue

    # Two different names may normalize to the same value. Refuse to rename onto a
    # different existing entry; samefile lets case-only renames through on
    # case-insensitive filesystems, where the target "exists" but is the source itself.
    if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
        print(f"[-] Skipping '{folder}': target '{new_folder}' already exists")
        continue

    os.rename(old_path, new_path)

## Check image format

In [13]:
# Register the HEIF opener so Pillow can open those files as well
pillow_heif.register_heif_opener()

directory_path = os.path.join(os.getcwd(), 'images')
folders = os.listdir(directory_path)

# Try to open every file in every sub-folder and report the ones Pillow rejects.
# NOTE(review): Image.open is lazy per the Pillow docs — this confirms the format is
# identified, not that the pixel data decodes. Confirm that is the intended check.
for folder in folders:
    folder_path = os.path.join(directory_path, folder)

    # Only directories are inspected; any other entry at this level is ignored
    if os.path.isdir(folder_path):
        images = os.listdir(folder_path)
        for image_name in images:
            image_path = os.path.join(folder_path, image_name)
            try:
                # Opening is the whole check; the context manager closes the file again
                with Image.open(image_path):
                    pass
            except Exception as e:
                print(f"[-] Failed to read image '{image_name}' in folder '{folder}': {e}")



## Rename and resize images

In [14]:
# List the entries of the "images" directory located in the current working directory
images_dirname = 'images'
directory_path = os.path.join(os.getcwd(), images_dirname)
folders_images = os.listdir(directory_path)

In [15]:
def resize_with_padding(image, target_size):
    """Fit `image` into a `target_size` x `target_size` square without distortion.

    The image is scaled so that its longest side equals `target_size`, then pasted
    centred on a black RGB canvas; the remaining area is black padding.

    Args:
        image: Pillow image (read through `.size` and `.resize`).
        target_size: Side length, in pixels, of the square output.

    Returns:
        A new RGB image of size `(target_size, target_size)`.
    """
    old_size = image.size  # (width, height) in Pillow
    ratio = float(target_size) / max(old_size)

    # int() truncates, so a very elongated image would get a 0-pixel short side,
    # which resize() rejects. Clamp every side to at least 1 pixel.
    new_size = tuple(max(1, int(x * ratio)) for x in old_size)

    resized_img = image.resize(new_size, Image.Resampling.LANCZOS)

    # Top-left corner that centres the resized image on the canvas
    left = (target_size - new_size[0]) // 2
    top = (target_size - new_size[1]) // 2

    # New image with black background
    new_img = Image.new("RGB", (target_size, target_size), (0, 0, 0))
    new_img.paste(resized_img, (left, top))
    return new_img

In [16]:
TARGET_SIZE = 224                 # side length (px) of the square output images
STAGING_DIRNAME = "_resized_tmp"  # temporary sub-folder used while a folder is processed

for folder in folders_images:
    # --> Read images in the folder
    folder_path = os.path.join(directory_path, folder)
    if not os.path.isdir(folder_path):
        continue

    # Outputs are staged in a sub-folder first. Writing "<folder>_<n>.jpg" straight
    # into folder_path could overwrite a source image of the same name that has not
    # been read yet (e.g. when the cell is run a second time), silently losing images.
    # A leftover staging folder from an interrupted run makes this raise on purpose,
    # so staged results are never overwritten without the user noticing.
    staging_path = os.path.join(folder_path, STAGING_DIRNAME)
    os.makedirs(staging_path)

    # Files only (sub-directories are not images), sorted for a deterministic numbering
    images = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    counter = 1

    for image in tqdm(images, desc=f"Procesando imágenes de {folder}"):
        image_path = os.path.join(folder_path, image)
        try:
            # --> Open and resize; the context manager releases the file handle so the
            #     source can be removed afterwards (an open handle blocks removal on Windows)
            with Image.open(image_path) as img:
                resized_img = resize_with_padding(img, TARGET_SIZE)

            # --> Define new name and save into the staging folder
            new_name = f"{folder}_{counter}.jpg"
            resized_img.save(os.path.join(staging_path, new_name), "JPEG")

            # --> Remove old image only after the resized copy was written
            os.remove(image_path)
            counter += 1
        except Exception as e:
            # Best effort: report the failure and leave the source image untouched
            print(f"Error en la imagen {image}: {e}")

    # --> Move the staged results next to any source that could not be processed
    for new_name in os.listdir(staging_path):
        new_path = os.path.join(folder_path, new_name)
        os.replace(os.path.join(staging_path, new_name), new_path)
    os.rmdir(staging_path)

Procesando imágenes de alexia_cruces: 100%|██████████| 211/211 [00:23<00:00,  9.05it/s]
Procesando imágenes de armando_garcia: 100%|██████████| 195/195 [00:48<00:00,  3.98it/s]
Procesando imágenes de armando_islas: 100%|██████████| 103/103 [00:06<00:00, 16.91it/s]
Procesando imágenes de carlos_aguilar: 100%|██████████| 200/200 [00:18<00:00, 10.89it/s]
Procesando imágenes de cinthya_sanchez: 100%|██████████| 202/202 [00:46<00:00,  4.31it/s]
Procesando imágenes de daniela_flores: 100%|██████████| 201/201 [00:35<00:00,  5.74it/s]
Procesando imágenes de diego_rodriguez: 100%|██████████| 119/119 [00:26<00:00,  4.46it/s]
Procesando imágenes de ernesto_rosales: 100%|██████████| 202/202 [00:43<00:00,  4.67it/s]
Procesando imágenes de evelyn_escudero: 100%|██████████| 203/203 [00:07<00:00, 26.57it/s]
Procesando imágenes de fernando_carmona: 100%|██████████| 206/206 [00:31<00:00,  6.47it/s]
Procesando imágenes de gerardo_martinez: 100%|██████████| 200/200 [00:54<00:00,  3.69it/s]
Procesando imág

## Split images into train, validation, and test sets

In [7]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42          # fixed seed so the split is reproducible
MIN_IMAGES_TO_SPLIT = 6  # below this the 80/10/10 split leaves val or test empty and raises

# Built with os.path.join instead of hard-coded "\\" so it also works outside Windows
dataset_root = os.path.join(os.getcwd(), 'dataset')

# NOTE: `folders` and `directory_path` come from the earlier cells of this notebook.
# The description is updated inside the loop; formatting `folder` in the tqdm(...)
# call itself would read a stale value left over from a previous cell.
progress = tqdm(folders, desc="Enviando imágenes")
for folder in progress:
    progress.set_description(f"Enviando imágenes de {folder}")

    folder_path = os.path.join(directory_path, folder)
    if not os.path.isdir(folder_path):
        continue

    # Sorted because os.listdir order is arbitrary; without it the seed alone does
    # not make the split reproducible.
    images = sorted(os.listdir(folder_path))
    if len(images) < MIN_IMAGES_TO_SPLIT:
        print(f"Se omite {folder}: solo {len(images)} imágenes")
        continue

    # 80% train, then the remaining 20% is halved into validation and test
    train, temp = train_test_split(images, test_size=0.2, random_state=SPLIT_SEED)
    val, test = train_test_split(temp, test_size=0.5, random_state=SPLIT_SEED)

    # Create dataset/<split>/<folder> and move the corresponding images into it
    for split_name, split_images in (("train", train), ("val", val), ("test", test)):
        split_path = os.path.join(dataset_root, split_name, folder)
        os.makedirs(split_path, exist_ok=True)

        for image in split_images:
            image_path = os.path.join(folder_path, image)
            new_path = os.path.join(split_path, image)
            os.rename(image_path, new_path)

    # Delete the (now empty) source folder
    os.rmdir(folder_path)

Enviando imágenes de uriel_martinez: 100%|██████████| 25/25 [00:04<00:00,  5.21it/s]


In [13]:
# Locate the three split directories and list the folders found in each of them
dataset_root = os.path.join(os.getcwd(), 'dataset')
test_path, train_path, val_path = (
    os.path.join(dataset_root, split) for split in ('test', 'train', 'val')
)
test_folders, train_folders, val_folders = (
    os.listdir(split_path) for split_path in (test_path, train_path, val_path)
)

# Number of entries per folder in the training split
for folder in train_folders:
    folder_path = os.path.join(train_path, folder)
    images = os.listdir(folder_path)
    image_count = len(images)
    print(f"Train - {folder}: {image_count} images")

Train - alexia_cruces: 168 images
Train - armando_garcia: 156 images
Train - armando_islas: 82 images
Train - carlos_aguilar: 160 images
Train - cinthya_sanchez: 161 images
Train - daniela_flores: 160 images
Train - diego_rodriguez: 95 images
Train - ernesto_rosales: 161 images
Train - evelyn_escudero: 162 images
Train - fernando_carmona: 164 images
Train - gerardo_martinez: 160 images
Train - isaac_saenz: 144 images
Train - ismael_arista: 52 images
Train - jessica_juarez: 160 images
Train - jorge_orozco: 160 images
Train - jose_pina: 137 images
Train - marlene_vazquez: 98 images
Train - mauricio_cortes: 93 images
Train - natalia_anaya: 124 images
Train - oscar_espinosa_berrueco: 159 images
Train - rafael_diaz: 57 images
Train - romario_reyes: 160 images
Train - santiago_barranco: 160 images
Train - sergio_gutierrez: 104 images
Train - uriel_martinez: 100 images


In [14]:
# Number of entries per folder in the test split
for folder in test_folders:
    folder_path = os.path.join(test_path, folder)
    images = os.listdir(folder_path)
    image_count = len(images)
    print(f"Test - {folder}: {image_count} images")

Test - alexia_cruces: 22 images
Test - armando_garcia: 20 images
Test - armando_islas: 11 images
Test - carlos_aguilar: 20 images
Test - cinthya_sanchez: 21 images
Test - daniela_flores: 21 images
Test - diego_rodriguez: 12 images
Test - ernesto_rosales: 21 images
Test - evelyn_escudero: 21 images
Test - fernando_carmona: 21 images
Test - gerardo_martinez: 20 images
Test - isaac_saenz: 18 images
Test - ismael_arista: 7 images
Test - jessica_juarez: 20 images
Test - jorge_orozco: 20 images
Test - jose_pina: 18 images
Test - marlene_vazquez: 13 images
Test - mauricio_cortes: 12 images
Test - natalia_anaya: 16 images
Test - oscar_espinosa_berrueco: 20 images
Test - rafael_diaz: 8 images
Test - romario_reyes: 20 images
Test - santiago_barranco: 20 images
Test - sergio_gutierrez: 14 images
Test - uriel_martinez: 13 images


In [15]:
# Number of entries per folder in the validation split
for folder in val_folders:
    folder_path = os.path.join(val_path, folder)
    images = os.listdir(folder_path)
    image_count = len(images)
    print(f"Val - {folder}: {image_count} images")

Val - alexia_cruces: 21 images
Val - armando_garcia: 19 images
Val - armando_islas: 10 images
Val - carlos_aguilar: 20 images
Val - cinthya_sanchez: 20 images
Val - daniela_flores: 20 images
Val - diego_rodriguez: 12 images
Val - ernesto_rosales: 20 images
Val - evelyn_escudero: 20 images
Val - fernando_carmona: 21 images
Val - gerardo_martinez: 20 images
Val - isaac_saenz: 18 images
Val - ismael_arista: 7 images
Val - jessica_juarez: 20 images
Val - jorge_orozco: 20 images
Val - jose_pina: 17 images
Val - marlene_vazquez: 12 images
Val - mauricio_cortes: 12 images
Val - natalia_anaya: 16 images
Val - oscar_espinosa_berrueco: 20 images
Val - rafael_diaz: 7 images
Val - romario_reyes: 20 images
Val - santiago_barranco: 20 images
Val - sergio_gutierrez: 13 images
Val - uriel_martinez: 13 images
