In [None]:
# Mount Google Drive inside the Colab runtime; force_remount=True asks for
# a fresh mount even if the drive is already mounted.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

import os
import random
import shutil
import cv2
import albumentations as A
from tqdm import tqdm

# 1. Configuration
# Source folders, one per class.
ADENOMA_IMG = '/content/drive/MyDrive/TFG/colonoscopias/Adenomas'
SERRADO_IMG = '/content/drive/MyDrive/TFG/colonoscopias/Serrados'

# Root of the generated dataset, laid out as OUT_DIR/<split>/<class>.
OUT_DIR = '/content/drive/MyDrive/TFG/dataset_clasificacion'

# Create every <split>/<class> folder up front; exist_ok makes re-runs harmless.
for split in ('train', 'val', 'test'):
    for class_dir in ('Adenoma', 'Serrado'):
        os.makedirs(os.path.join(OUT_DIR, split, class_dir), exist_ok=True)



Mounted at /content/drive


In [None]:
# 2. Augmentations
# Pipeline applied to RGB images when generating extra training samples.
# Each transform fires independently with its own probability `p`.
augment_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    A.Rotate(limit=15, p=0.5),
    A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.3),
    # Shift/scale only (rotation is handled by A.Rotate above); pixels exposed
    # by the shift are filled with a constant border instead of reflected.
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.1,
        rotate_limit=0,
        border_mode=cv2.BORDER_CONSTANT,
        p=0.5,
    ),
    A.Affine(shear=(-10, 10), p=0.3),
    # The recorded output of this cell shows a warning pointing at the
    # CoarseDropout call when it is built with the legacy keywords
    # (max_holes / max_height / max_width / fill_value), so the range-based
    # keywords are used instead. Integer ranges are pixel sizes.
    # NOTE(review): (4, 4) holes of (16, 16) px is meant to reproduce the
    # legacy behaviour where an unset min_* defaulted to max_* -- confirm
    # against the installed albumentations version. The fill colour is left
    # at the library default (assumed black, matching the old fill_value=0).
    A.CoarseDropout(
        num_holes_range=(4, 4),
        hole_height_range=(16, 16),
        hole_width_range=(16, 16),
        p=0.2,
    ),
])



  original_init(self, **validated_kwargs)
  A.CoarseDropout(max_holes=4, max_height=16, max_width=16, fill_value=0, p=0.2)


In [None]:
# 3. Data split
def split_data(file_list, train_ratio=0.7, val_ratio=0.2, seed=None):
    """Randomly partition ``file_list`` into train, validation and test lists.

    The input is copied before shuffling, so the caller's list keeps its
    original order.

    Args:
        file_list: Sequence of items (here, image paths) to split.
        train_ratio: Fraction of the items assigned to the training split.
        val_ratio: Fraction of the items assigned to the validation split.
            Whatever remains after train and validation goes to test.
        seed: Optional seed for a private random generator. When ``None``
            the module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        A ``(train, val, test)`` tuple of lists. Split sizes are truncated
        with ``int()``, so rounding leftovers end up in the test split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    shuffled = list(file_list)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    n = len(shuffled)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    val_end = n_train + n_val
    return shuffled[:n_train], shuffled[n_train:val_end], shuffled[val_end:]

# List images
def list_images(folder):
    """Return the paths of the image files directly inside ``folder``.

    A file counts as an image when its name ends in ``.jpg``, ``.png`` or
    ``.jpeg`` (case-insensitive). Sub-folders are not traversed.

    Args:
        folder: Directory to scan.

    Returns:
        List of ``folder``-joined paths, sorted by file name. Sorting
        matters because ``os.listdir`` returns entries in arbitrary order,
        which would otherwise make any seeded shuffle of the result
        irreproducible.
    """
    image_exts = ('.jpg', '.png', '.jpeg')
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith(image_exts)
    ]

# Sort the listings so the shuffle below starts from a stable order
# (directory listing order is not guaranteed).
adenoma_imgs = sorted(list_images(ADENOMA_IMG))
serrado_imgs = sorted(list_images(SERRADO_IMG))

# Fix the RNG state before splitting. The output folders are reused between
# runs, so an unseeded re-run would assign images to different splits and
# could leave the same image in both train and test.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

ade_train, ade_val, ade_test = split_data(adenoma_imgs)
ser_train, ser_val, ser_test = split_data(serrado_imgs)



In [None]:
# 4. Save with augmentations
def save_augmented(img_paths, label, split, n_aug=3):
    """Write images to ``OUT_DIR/<split>/<label>`` as JPEG, augmenting train.

    Every readable image is re-encoded as ``<basename>.jpg``. For the
    ``'train'`` split only, ``n_aug`` additional randomly augmented copies
    are written as ``<basename>_aug<i>.jpg``.

    Args:
        img_paths: Iterable of source image paths.
        label: Class folder name (e.g. ``'Adenoma'``).
        split: Split folder name; augmentation runs only when it equals
            ``'train'``.
        n_aug: Number of augmented copies per training image.

    Returns:
        None. Images that cannot be read and files that cannot be written
        are skipped, and a summary of each is printed at the end instead of
        failing silently.
    """
    out_path = os.path.join(OUT_DIR, split, label)
    # Make the function usable even if the folder tree was not pre-created.
    os.makedirs(out_path, exist_ok=True)

    do_augment = split == 'train' and n_aug > 0
    unreadable = []
    failed_writes = []

    for img_path in tqdm(img_paths, desc=f"{label} {split}"):
        base = os.path.splitext(os.path.basename(img_path))[0]
        img_bgr = cv2.imread(img_path)
        if img_bgr is None:
            # cv2.imread signals an unreadable file with None, not an exception.
            unreadable.append(img_path)
            continue

        # Save the original
        dst_path = os.path.join(out_path, base + '.jpg')
        if not cv2.imwrite(dst_path, img_bgr):
            failed_writes.append(dst_path)

        # Augment only in train
        if not do_augment:
            continue

        # The pipeline is fed RGB, while OpenCV reads and writes BGR; convert
        # once per image, and only when augmentation actually happens.
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
        for i in range(n_aug):
            transformed = augment_transform(image=img_rgb)
            aug_img = cv2.cvtColor(transformed['image'], cv2.COLOR_RGB2BGR)
            aug_path = os.path.join(out_path, f"{base}_aug{i}.jpg")
            if not cv2.imwrite(aug_path, aug_img):
                failed_writes.append(aug_path)

    if unreadable:
        print(f"{label} {split}: {len(unreadable)} imagen(es) no se pudieron leer y se omitieron:")
        for path in unreadable:
            print(f"  - {path}")
    if failed_writes:
        print(f"{label} {split}: {len(failed_writes)} archivo(s) no se pudieron escribir:")
        for path in failed_writes:
            print(f"  - {path}")



In [None]:
# 5. Run everything
# One entry per class, each mapping split name -> image paths. Dict order
# fixes the processing order: Adenoma (train, val, test), then Serrado.
paths_by_class = {
    'Adenoma': {'train': ade_train, 'val': ade_val, 'test': ade_test},
    'Serrado': {'train': ser_train, 'val': ser_val, 'test': ser_test},
}

for class_name, paths_by_split in paths_by_class.items():
    for split_name, split_paths in paths_by_split.items():
        # Train relies on the function's default n_aug; val/test disable it.
        extra_kwargs = {} if split_name == 'train' else {'n_aug': 0}
        save_augmented(split_paths, class_name, split_name, **extra_kwargs)

print("✅ Dataset de clasificación listo con aumentaciones.")


Adenoma train: 100%|██████████| 214/214 [00:32<00:00,  6.54it/s]
Adenoma val: 100%|██████████| 61/61 [00:02<00:00, 25.88it/s]
Adenoma test: 100%|██████████| 32/32 [00:02<00:00, 10.77it/s]
Serrado train: 100%|██████████| 207/207 [03:09<00:00,  1.09it/s]
Serrado val: 100%|██████████| 59/59 [00:50<00:00,  1.18it/s]
Serrado test: 100%|██████████| 30/30 [00:22<00:00,  1.32it/s]

✅ Dataset de clasificación listo con aumentaciones.



