Generar el Split para el mendeley_dataset

In [6]:
import os
import shutil

import cv2
import numpy as np
import matplotlib.pyplot as plt

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from tensorflow.keras.preprocessing.image import ImageDataGenerator # type: ignore
from sklearn.model_selection import train_test_split

from src.config import *
from src.data import *
from src.models.efficientnet import EfficientNetB5Custom
from src.utils import *
from src.data import OriginalOAIDataset
from src.train import train, train_model
from src.trainers.classification import Classification


# Seed NumPy's global RNG and PyTorch's default generator with the same value.
# NOTE(review): RANDOM_SEED is not defined in this notebook; presumably it
# comes from the `src.config` star import — confirm.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)



<torch._C.Generator at 0x1cca6ffead0>

In [4]:
# Source directory; later cells list its entries and treat each one as a class folder.
# NOTE(review): MENDELEY_EXPERT1_PATH is not defined in this notebook;
# presumably it comes from the `src.config` star import — confirm.
ORIGINAL_PATH = MENDELEY_EXPERT1_PATH
# Destination root under which the train/val/test folders are created.
NEW_MENDELEY_EXPERT1_PATH = 'dataset/mendeley_dataset/medical_expert_1_split'

In [5]:

# Create the directory skeleton for a 70/10/20 train/val/test split:
#   <NEW_MENDELEY_EXPERT1_PATH>/<split>/<class>/
# `exist_ok=True` makes the cell safe to re-run and avoids the race between
# an `os.path.exists` check and the subsequent `os.makedirs` call.

NEW_TRAIN_PATH = os.path.join(NEW_MENDELEY_EXPERT1_PATH, 'train')
NEW_VAL_PATH = os.path.join(NEW_MENDELEY_EXPERT1_PATH, 'val')
NEW_TEST_PATH = os.path.join(NEW_MENDELEY_EXPERT1_PATH, 'test')

for split_dir in (NEW_TRAIN_PATH, NEW_VAL_PATH, NEW_TEST_PATH):
    # Create the split folder even when the source has no class folders.
    os.makedirs(split_dir, exist_ok=True)
    for c in os.listdir(ORIGINAL_PATH):
        # Only sub-directories are classes; stray files in the dataset root
        # (e.g. Thumbs.db, .DS_Store) must not become class folders.
        if not os.path.isdir(os.path.join(ORIGINAL_PATH, c)):
            continue
        class_split_dir = os.path.join(split_dir, c)
        os.makedirs(class_split_dir, exist_ok=True)



In [None]:
# Split every class folder of ORIGINAL_PATH into train/val/test (70/10/20)
# and MOVE the image files into NEW_MENDELEY_EXPERT1_PATH/<split>/<class>/.
#
# WARNING: this cell is destructive — files are moved out of ORIGINAL_PATH.
# On a re-run the class folders are already empty and are simply skipped.

# Dedicated, seeded generator: the assignment of images to splits does not
# depend on how much of the global NumPy RNG earlier cells have consumed.
split_rng = np.random.default_rng(RANDOM_SEED)

# Only sub-directories are classes; sorting gives a stable processing order
# (os.listdir order is arbitrary), which the seeded shuffle relies on.
classes = sorted(
    entry for entry in os.listdir(ORIGINAL_PATH)
    if os.path.isdir(os.path.join(ORIGINAL_PATH, entry))
)

for c in classes:
    class_dir = os.path.join(ORIGINAL_PATH, c)

    # Sorted for the same reproducibility reason as `classes`.
    images = sorted(os.listdir(class_dir))
    n_images = len(images)
    if n_images == 0:
        print("No hay imágenes para la clase", c)
        continue

    n_train = int(n_images * 0.7)
    n_val = int(n_images * 0.1)
    # Test takes the remainder so the three counts always add up to n_images.
    n_test = n_images - n_train - n_val

    print("Generando split para la clase", c)
    print("Train:", n_train)
    print("Val:", n_val)
    print("Test:", n_test)

    # Shuffle once and slice. Unlike train_test_split this also works for
    # very small classes, where one of the partitions may be empty.
    shuffled = [images[i] for i in split_rng.permutation(n_images)]
    train_images = shuffled[:n_train]
    val_images = shuffled[n_train:n_train + n_val]
    test_images = shuffled[n_train + n_val:]

    for split_name, split_images in (
        ('train', train_images),
        ('val', val_images),
        ('test', test_images),
    ):
        dest_dir = os.path.join(NEW_MENDELEY_EXPERT1_PATH, split_name, c)
        # Do not rely on a previous cell having created the skeleton.
        os.makedirs(dest_dir, exist_ok=True)
        for img in split_images:
            shutil.move(os.path.join(class_dir, img), os.path.join(dest_dir, img))


Generando split para la clase 0
Train: 359
Val: 51
Test: 104
Train: 359
Val: 51
Test: 104
Generando split para la clase 1
Train: 333
Val: 47
Test: 97
Train: 333
Val: 47
Test: 97
Generando split para la clase 2
Train: 162
Val: 23
Test: 47
Train: 162
Val: 23
Test: 47
Generando split para la clase 3
Train: 154
Val: 22
Test: 45
Train: 154
Val: 22
Test: 45
Generando split para la clase 4
Train: 144
Val: 20
Test: 42
Train: 144
Val: 20
Test: 42


In [None]:

# Build a class-balanced training set under NEW_OAI_DATASET/train:
#   1. copy every readable image of each class (re-encoded through OpenCV),
#   2. top the class up to TARGET_IMAGES_PER_CLASS files with augmented
#      images written by `data_gen.flow(..., save_to_dir=...)`.
#
# NOTE(review): `data_gen`, `NEW_OAI_DATASET` and `ORIGINAL_TRAIN_PATH` are not
# defined in the visible cells, and `classes` comes from the Mendeley split
# cell. Confirm they are set (and that `classes` matches the folders in
# ORIGINAL_TRAIN_PATH) before running on a fresh kernel.

TARGET_IMAGES_PER_CLASS = 1000

TRAIN_PATH = os.path.join(NEW_OAI_DATASET, 'train')
# makedirs creates NEW_OAI_DATASET as well and tolerates existing folders.
os.makedirs(TRAIN_PATH, exist_ok=True)

for class_name in classes:
    class_dir = os.path.join(ORIGINAL_TRAIN_PATH, class_name)
    CLASS_PATH = os.path.join(TRAIN_PATH, class_name)
    os.makedirs(CLASS_PATH, exist_ok=True)

    print(f"Copiando imagenes de la clase {class_name}...")
    # Names of the source images OpenCV could decode; only these are used
    # as augmentation seeds below.
    readable_images = []
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)

        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            print(f"Error al leer la imagen {img_path}. Puede que no sea una imagen válida o esté dañada.")
            continue

        # Copy the image, prefixing the class name to the file name.
        new_img_path = os.path.join(CLASS_PATH, f"{class_name}_{img_name}")
        cv2.imwrite(new_img_path, img)
        readable_images.append(img_name)

    n_copied = len(os.listdir(CLASS_PATH))
    print(f"Se han copiado {n_copied} imágenes de la clase {class_name}")
    print(f"Generando imágenes aumentadas para la clase {class_name}...")

    if n_copied < TARGET_IMAGES_PER_CLASS and not readable_images:
        # Without a single usable source image the top-up loop below could
        # never reach the target and would spin forever.
        print(f"No hay imágenes válidas para aumentar la clase {class_name}; se omite.")
        continue

    n_current = n_copied
    while n_current < TARGET_IMAGES_PER_CLASS:
        # Per-image chance of being augmented in this pass: the missing
        # fraction relative to what is already there, plus a small margin.
        # max(..., 1) guards the division if nothing was written so far.
        probabilidad = ((TARGET_IMAGES_PER_CLASS - n_current) / max(n_current, 1)) + 0.05
        print(f"Probabilidad: {probabilidad}")
        for img_name in readable_images:
            if n_current >= TARGET_IMAGES_PER_CLASS:
                break
            if np.random.rand() > probabilidad:
                continue

            img_path = os.path.join(class_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # File vanished or became unreadable since the copy step.
                continue

            # OpenCV decodes to BGR, while the generator saves arrays as RGB;
            # convert so colour images are not written with swapped channels.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_array = img_rgb.reshape((1, ) + img_rgb.shape)
            # Pull exactly one batch: the side effect is one augmented file
            # written into CLASS_PATH.
            for batch in data_gen.flow(img_array, batch_size=1, save_to_dir=CLASS_PATH, save_prefix='aug', save_format='png'):
                break
            n_current += 1

        # Re-sync with the file system once per pass: generated file names
        # can collide and overwrite, so the running count may overestimate.
        n_current = len(os.listdir(CLASS_PATH))

    # Report only the files added on top of the copies, not the folder total.
    print(f"Se han generado {len(os.listdir(CLASS_PATH)) - n_copied} imágenes aumentadas para la clase {class_name}\n-----------------------------------\n")