## Check length in each folder

In [1]:
import os
import numpy as np
from tqdm import tqdm
from PIL import Image
import pillow_heif
import sys

# Get the project root
project_root = os.path.abspath("..")    # Go up one level from "notebooks/"
sys.path.append(project_root)           # Add the root to the path

from src.data.process_data import *

In [2]:
# --> Count how many images each student has across the train, test and val splits

# Paths of the three split directories (relative to the current working directory)
train_directory_path = os.path.join(os.getcwd(), '../', 'data', 'final', 'train')
test_directory_path = os.path.join(os.getcwd(), '../', 'data', 'final', 'test')
val_directory_path = os.path.join(os.getcwd(), '../', 'data', 'final', 'val')

# Entries (one folder per student) found in each split
train_folders = os.listdir(train_directory_path)
test_folders = os.listdir(test_directory_path)
val_folders = os.listdir(val_directory_path)

# Dictionaries: total images per student, and the subset with fewer than 200 images
student_dict = {}
student_dict_with_less_than_200 = {}

split_listings = (
    (train_directory_path, train_folders),
    (test_directory_path, test_folders),
    (val_directory_path, val_folders),
)
for split_directory_path, split_folders in split_listings:
    for raw_folder in split_folders:
        folder_path = os.path.join(split_directory_path, raw_folder)

        # Stray files sitting next to the student folders cannot be listed; skip them
        if not os.path.isdir(folder_path):
            continue

        # .get(..., 0) keeps the sum working for a student that is missing from
        # the train split (a plain `+=` would raise KeyError in that case)
        student_dict[raw_folder] = student_dict.get(raw_folder, 0) + len(os.listdir(folder_path))

print(f" Lenght: {len(student_dict)} ".center(50, "="))
for key, value in student_dict.items():
    if value < 200:
        student_dict_with_less_than_200[key] = value
        print(f"[-] {key} - {value}")

[-] armando_garcia - 195
[-] armando_islas - 103
[-] isaac_saenz - 180
[-] jose_pina - 172
[-] marlene_vazquez - 123
[-] oscar_espinosa_berrueco - 199
[-] rafael_diaz - 72
[-] sergio_gutierrez - 131
[-] uriel_martinez - 126


In [3]:
import shutil

# Keep only the raw folders that belong to a student listed in
# student_dict_with_less_than_200 AND that contain at least 200 entries.
# WARNING: every other folder under data/raw is permanently deleted.
raw_directory_path = os.path.join(os.getcwd(), '../', 'data', 'raw')
raw_folders = os.listdir(raw_directory_path)

print(f" Length of folders: {len(raw_folders)} ".center(50, "="))

for raw_folder in raw_folders:
    folder_path = os.path.join(raw_directory_path, raw_folder)

    # Plain files are left alone: os.listdir would raise on them and abort
    # the loop half-way through the clean-up
    if not os.path.isdir(folder_path):
        continue

    images = os.listdir(folder_path)

    if len(images) >= 200 and raw_folder in student_dict_with_less_than_200:
        print(f"[+] Folder: {raw_folder} - {len(images)} images")
    else:
        # Remove the folder; a failure is reported and the loop moves on
        try:
            shutil.rmtree(folder_path)
        except OSError as e:
            # "[-]" marks failures, matching the other cells of this notebook
            print(f"[-] Error: {e}")



## Change folder names

In [13]:
# Resolve the raw-data directory relative to the current working directory
# and list the entries it contains
raw_relative_parts = ('../', 'data', 'raw')
raw_directory_path = os.path.join(os.getcwd(), *raw_relative_parts)
raw_folders = os.listdir(raw_directory_path)

In [14]:
# Rename every raw folder to the name returned by normalize_folder_name
normalize_folders = [normalize_folder_name(folder) for folder in raw_folders]

for raw_folder, new_folder in zip(raw_folders, normalize_folders):
    # Already normalized: nothing to rename
    if new_folder == raw_folder:
        continue

    old_path = os.path.join(raw_directory_path, raw_folder)
    new_path = os.path.join(raw_directory_path, new_folder)

    # Two different folders can normalize to the same name. os.rename would then
    # either fail in the middle of the loop or replace an existing empty folder.
    # samefile() lets a case-only rename through on case-insensitive filesystems,
    # where the "existing" target is the source folder itself.
    if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
        print(f"[-] Cannot rename '{raw_folder}': '{new_folder}' already exists")
        continue

    os.rename(old_path, new_path)

## Check image format

In [15]:
# Register the HEIF opener so Pillow can also read HEIF/HEIC files
pillow_heif.register_heif_opener()

# Get paths
raw_directory_path = os.path.join(os.getcwd(), '../', 'data', 'raw')
raw_folders = os.listdir(raw_directory_path)

# Check that every file in every folder is an image Pillow can read
for raw_folder in raw_folders:
    folder_path = os.path.join(raw_directory_path, raw_folder)

    # Skip if it's not a folder
    if not os.path.isdir(folder_path):
        continue

    images = os.listdir(folder_path)
    for image_name in images:
        image_path = os.path.join(folder_path, image_name)
        try:
            with Image.open(image_path) as img:
                # Image.open is lazy: it only identifies the file. verify() also
                # checks the file for corruption, so broken images are reported
                # here instead of failing later during resizing.
                img.verify()
        except Exception as e:
            # Any failure (unknown format, corrupt data, unreadable file) is
            # reported and the scan continues with the next file
            print(f"[-] Failed to read image '{image_name}' in folder '{raw_folder}': {e}")


## Rename and resize

In [16]:
# Resolve the raw-data directory relative to the current working directory
# and list the entries it contains
raw_relative_parts = ('../', 'data', 'raw')
raw_directory_path = os.path.join(os.getcwd(), *raw_relative_parts)
folders_names = os.listdir(raw_directory_path)

In [17]:
# Resize every image of every raw folder to 224 (via resize_with_padding) and
# store it as "<folder>_<n>.jpg", replacing the original file.
#
# The work is done in three phases per folder so that writing "<folder>_<n>.jpg"
# can never destroy a source image that has not been read yet (this happened
# when the folder already contained files named like the targets, e.g. on a re-run).
for raw_folder in folders_names:
    # --> Read images in the folder
    folder_path = os.path.join(raw_directory_path, raw_folder)
    if not os.path.isdir(folder_path):
        continue

    # Sorted so the numbering is the same on every run (os.listdir order is arbitrary)
    images = sorted(os.listdir(folder_path))

    # --> Phase 1: resize everything in memory; nothing on disk is modified yet
    resized_images = []  # (original file name, resized image)
    for image in tqdm(images, desc=f"Procesando imágenes de {raw_folder}".center(40, "=")):
        image_path = os.path.join(folder_path, image)
        try:
            with Image.open(image_path) as img:
                # convert() returns a new image, so the result stays usable after
                # the source file is closed, and RGB is a mode JPEG can store.
                # NOTE(review): assumes resize_with_padding returns a PIL image
                # (the original code called .save(path, "JPEG") on it) - confirm.
                resized_img = resize_with_padding(img, 224).convert("RGB")
            resized_images.append((image, resized_img))
        except Exception as e:
            print(f"[-]Error en la imagen {image}: {e}")

    # --> Phase 2: write the resized images under their new names
    new_names = set()    # names produced by this run
    written_images = []  # originals whose resized copy is safely on disk
    counter = 1
    for image, resized_img in resized_images:
        new_name = f"{raw_folder}_{counter}.jpg"
        new_path = os.path.join(folder_path, new_name)
        try:
            resized_img.save(new_path, "JPEG")
        except Exception as e:
            # The counter is not advanced, so the next image reuses this name
            print(f"[-]Error en la imagen {image}: {e}")
            continue
        new_names.add(new_name)
        written_images.append(image)
        counter += 1

    # --> Phase 3: remove the old files, except those that were overwritten in
    # place by one of the new files
    for image in written_images:
        if image not in new_names:
            os.remove(os.path.join(folder_path, image))

===Procesando imágenes de galo_ayala====: 100%|██████████| 175/175 [00:32<00:00,  5.45it/s]
===Procesando imágenes de yahir_arias===: 100%|██████████| 205/205 [00:10<00:00, 19.23it/s]


## Split images into train, val and test

In [18]:
from sklearn.model_selection import train_test_split

# Move the images of every raw folder into data/final/{train,val,test}/<folder>
# using an 80/10/10 split, then delete the emptied raw folder.
# NOTE(review): os.rename may replace a file of the same name that already
# exists in the destination (platform dependent) - confirm the destination
# folders do not hold older images with clashing names.
final_directory_path = os.path.join(os.getcwd(), '../', 'data', 'final')

progress_bar = tqdm(raw_folders)
for raw_folder in progress_bar:
    # The description has to be refreshed inside the loop: an f-string passed as
    # tqdm(desc=...) is evaluated once, before the first iteration
    progress_bar.set_description(f"Enviando imágenes de {raw_folder}")

    folder_path = os.path.join(raw_directory_path, raw_folder)

    # Skip if it's not a folder
    if not os.path.isdir(folder_path):
        continue

    # Sorted so that random_state=42 always yields the same split
    # (os.listdir returns entries in arbitrary order)
    images = sorted(os.listdir(folder_path))

    try:
        # 80% train, 20% held out
        train, temp = train_test_split(images, test_size=0.2, random_state=42)
        # Held-out part split evenly into validation and test
        val, test = train_test_split(temp, test_size=0.5, random_state=42)
    except ValueError as e:
        # Raised when the folder has too few images to fill every split;
        # nothing has been moved yet, so the folder is simply left untouched
        print(f"[-] Skipping '{raw_folder}': {e}")
        continue

    split_images = {'train': train, 'val': val, 'test': test}
    for split_name, split in split_images.items():
        # Create the destination folder of this split
        split_path = os.path.join(final_directory_path, split_name, raw_folder)
        os.makedirs(split_path, exist_ok=True)

        # Move the images of this split
        for image in split:
            image_path = os.path.join(folder_path, image)
            new_path = os.path.join(split_path, image)
            os.rename(image_path, new_path)

    # Delete the (now empty) raw folder
    os.rmdir(folder_path)

Enviando imágenes de yahir_arias: 100%|██████████| 2/2 [00:00<00:00,  4.83it/s]


In [19]:
# Resolve the three split directories of the final dataset and list the
# class folders found in each one
test_path, train_path, val_path = (
    os.path.join(os.getcwd(), '../', 'data', 'final', split_name)
    for split_name in ('test', 'train', 'val')
)

test_folders = os.listdir(test_path)
train_folders = os.listdir(train_path)
val_folders = os.listdir(val_path)

# Header with the number of train folders, then one line per train folder
print(f"[+] Length of folders: {len(train_folders)}".center(50, "="))
train_class_dirs = [(name, os.path.join(train_path, name)) for name in train_folders]
for raw_folder, folder_path in train_class_dirs:
    images = os.listdir(folder_path)
    print(f"Train - {raw_folder}: {len(images)} images")

Train - alexia_cruces: 168 images
Train - armando_garcia: 156 images
Train - armando_islas: 82 images
Train - carlos_aguilar: 160 images
Train - cinthya_sanchez: 161 images
Train - daniela_flores: 160 images
Train - diego_rodriguez: 162 images
Train - ernesto_rosales: 161 images
Train - evelyn_escudero: 162 images
Train - fernando_carmona: 164 images
Train - galo_ayala: 140 images
Train - gerardo_martinez: 160 images
Train - isaac_saenz: 144 images
Train - ismael_arista: 180 images
Train - jessica_juarez: 160 images
Train - jesus_soria: 160 images
Train - jorge_orozco: 160 images
Train - jose_pina: 137 images
Train - marlene_vazquez: 98 images
Train - mauricio_cortes: 160 images
Train - natalia_anaya: 160 images
Train - oscar_espinosa_berrueco: 159 images
Train - rafael_diaz: 57 images
Train - romario_reyes: 160 images
Train - santiago_barranco: 160 images
Train - sergio_gutierrez: 104 images
Train - uriel_martinez: 100 images
Train - yahir_arias: 164 images


In [20]:
# Print the number of entries in every folder of the test split
test_class_dirs = [(name, os.path.join(test_path, name)) for name in test_folders]
for raw_folder, folder_path in test_class_dirs:
    images = os.listdir(folder_path)
    print(f"Test - {raw_folder}: {len(images)} images")

Test - alexia_cruces: 22 images
Test - armando_garcia: 20 images
Test - armando_islas: 11 images
Test - carlos_aguilar: 20 images
Test - cinthya_sanchez: 21 images
Test - daniela_flores: 21 images
Test - diego_rodriguez: 21 images
Test - ernesto_rosales: 21 images
Test - evelyn_escudero: 21 images
Test - fernando_carmona: 21 images
Test - galo_ayala: 18 images
Test - gerardo_martinez: 20 images
Test - isaac_saenz: 18 images
Test - ismael_arista: 23 images
Test - jessica_juarez: 20 images
Test - jesus_soria: 20 images
Test - jorge_orozco: 20 images
Test - jose_pina: 18 images
Test - marlene_vazquez: 13 images
Test - mauricio_cortes: 21 images
Test - natalia_anaya: 21 images
Test - oscar_espinosa_berrueco: 20 images
Test - rafael_diaz: 8 images
Test - romario_reyes: 20 images
Test - santiago_barranco: 20 images
Test - sergio_gutierrez: 14 images
Test - uriel_martinez: 13 images
Test - yahir_arias: 21 images


In [21]:
# Print the number of entries in every folder of the validation split
val_class_dirs = [(name, os.path.join(val_path, name)) for name in val_folders]
for raw_folder, folder_path in val_class_dirs:
    images = os.listdir(folder_path)
    print(f"Val - {raw_folder}: {len(images)} images")

Val - alexia_cruces: 21 images
Val - armando_garcia: 19 images
Val - armando_islas: 10 images
Val - carlos_aguilar: 20 images
Val - cinthya_sanchez: 20 images
Val - daniela_flores: 20 images
Val - diego_rodriguez: 20 images
Val - ernesto_rosales: 20 images
Val - evelyn_escudero: 20 images
Val - fernando_carmona: 21 images
Val - galo_ayala: 17 images
Val - gerardo_martinez: 20 images
Val - isaac_saenz: 18 images
Val - ismael_arista: 23 images
Val - jessica_juarez: 20 images
Val - jesus_soria: 20 images
Val - jorge_orozco: 20 images
Val - jose_pina: 17 images
Val - marlene_vazquez: 12 images
Val - mauricio_cortes: 20 images
Val - natalia_anaya: 20 images
Val - oscar_espinosa_berrueco: 20 images
Val - rafael_diaz: 7 images
Val - romario_reyes: 20 images
Val - santiago_barranco: 20 images
Val - sergio_gutierrez: 13 images
Val - uriel_martinez: 13 images
Val - yahir_arias: 20 images
