## Importando variáveis de ambiente
Esse notebook prevê a existência de 2 variáveis de ambiente no arquivo .env desse projeto:
- DATA_FOLDER
- DATASET_FOLDER

In [6]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path=".env")

DATA_FOLDER = os.getenv("DATA_FOLDER")
DATASET_FOLDER = os.getenv("DATASET_FOLDER")

## Bibliotecas Utilizadas

In [None]:
import random
from pathlib import Path
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

## Variáveis de configuração

In [8]:
SOURCE_DIR = DATASET_FOLDER
OUTPUT_DIR = os.path.join(DATA_FOLDER, 'splits')

TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15

AUG_PER_IMAGE = 3 # Quantas versões aumentadas (augmentations) por imagem

## Definindo as transform para augmentation offline

In [None]:
augmentation_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    
    transforms.RandomApply([
        transforms.RandomAffine(
            degrees=25,
            translate=(0.10, 0.10),   # deslocamento leve
            scale=(0.95, 1.05),       # zoom leve
            shear=10
        )
    ], p=0.8),

    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.15),

    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),

    transforms.ColorJitter(
        brightness=0.4,
        contrast=0.4,
        saturation=0.4,
        hue=0.05
    ),

    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.3),
    transforms.RandomAutocontrast(p=0.3),
])

# Transformação base (sem aumento)
basic_transform = transforms.Resize((224, 224))

## Funções para salvar as imagens e processá-las (split e augmentation)

In [39]:
def save_image(img: Image.Image, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    img.save(path)

def process_class(class_path: Path):
    images = list(class_path.glob("*.jpg")) + list(class_path.glob("*.png"))
    random.shuffle(images)
    class_name = class_path.name

    n_total = len(images)
    n_train = int(n_total * TRAIN_SPLIT)
    n_val = int(n_total * VAL_SPLIT)

    splits = {
        "train": images[:n_train],
        "val": images[n_train:n_train + n_val],
        "test": images[n_train + n_val:],
    }

    for split_name, split_images in splits.items():
        for img_path in tqdm(split_images, desc=f"{class_name} - {split_name}"):
            img = Image.open(img_path).convert("RGB")

            # Salva versão sem augmentation
            out_path = OUTPUT_DIR / split_name / class_name / img_path.name
            save_image(basic_transform(img), out_path)

            # Gera imagens aumentadas apenas para o split de treino
            if split_name == "train":
                for i in range(AUG_PER_IMAGE):
                    aug_img = augmentation_transform(img)
                    aug_name = img_path.stem + f"_aug{i}.jpg"
                    out_aug_path = OUTPUT_DIR / split_name / class_name / aug_name
                    save_image(aug_img, out_aug_path)

## Execução

In [None]:
classes = [p for p in SOURCE_DIR.iterdir() if p.is_dir()]
for cls_path in classes:
    process_class(cls_path)

Gobio gobio - train: 100%|██████████| 159/159 [00:07<00:00, 21.28it/s]
Gobio gobio - val: 100%|██████████| 34/34 [00:00<00:00, 280.26it/s]
Gobio gobio - test: 100%|██████████| 35/35 [00:00<00:00, 282.38it/s]
Gasterosteus aculeatus - train: 100%|██████████| 40/40 [00:02<00:00, 19.03it/s]
Gasterosteus aculeatus - val: 100%|██████████| 8/8 [00:00<00:00, 266.03it/s]
Gasterosteus aculeatus - test: 100%|██████████| 10/10 [00:00<00:00, 336.02it/s]
Neogobius fluviatilis - train: 100%|██████████| 71/71 [00:03<00:00, 21.48it/s]
Neogobius fluviatilis - val: 100%|██████████| 15/15 [00:00<00:00, 335.28it/s]
Neogobius fluviatilis - test: 100%|██████████| 16/16 [00:00<00:00, 314.44it/s]
Barbus barbus - train: 100%|██████████| 235/235 [00:12<00:00, 19.27it/s]
Barbus barbus - val: 100%|██████████| 50/50 [00:00<00:00, 207.50it/s]
Barbus barbus - test: 100%|██████████| 51/51 [00:00<00:00, 224.70it/s]
Neogobius melanostomus - train: 100%|██████████| 170/170 [00:08<00:00, 20.53it/s]
Neogobius melanostomus 