<a href="https://colab.research.google.com/github/VSUrhuel/forage-classifier/blob/main/Splitting_and_Augmenting_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2

In [None]:
# Mount Google Drive; every dataset path used below lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Where the raw per-class image folders are read from, and where the
# train/val/test split is written to.
original_dataset_dir = '/content/drive/MyDrive/_Shared/dataset'
new_base_dir = '/content/drive/MyDrive/Thesis/Dataset'

# One sub-folder per split, directly under the output root.
train_dir, val_dir, test_dir = (
    os.path.join(new_base_dir, split_name)
    for split_name in ('train', 'val', 'test')
)

# Create the output root first, then each split folder; existing folders are kept.
for output_dir in (new_base_dir, train_dir, val_dir, test_dir):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Class names; each one is used as a sub-folder name inside every split folder.
classes = ['carabao-grass', 'centro', 'gliricidia', 'leucaena', 'para-grass']

# Make sure train/<class>, val/<class> and test/<class> all exist.
for cls in classes:
    for split_root in (train_dir, val_dir, test_dir):
        os.makedirs(os.path.join(split_root, cls), exist_ok=True)

# Fraction of each class assigned to the train / validation / test splits.
train_alloc, val_alloc, test_alloc = 0.8, 0.1, 0.1

In [None]:
# Options for the generator that produces augmented copies of training images:
# pixel rescaling plus random rotation, shifts, shear, zoom and horizontal flips.
augmentation_options = dict(
    rescale=1 / 255.0,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Split every class folder into train/val/test and write the result to Drive.
#
# * Training images are saved as "<cls>_<index>_orig.jpg" followed by
#   AUGS_PER_IMAGE augmented variants "<cls>_<index+k>_aug<k>.jpg".
# * Validation and test images are copied unchanged as "<cls>_<index>.jpg".
#
# The cell is written to be safely re-runnable:
# * The file listing is sorted and shuffled with a fixed seed, so every run
#   produces the same split. Without this, a re-run could place an image that
#   is already in train/ into val/ or test/.
# * Every training image consumes exactly FILES_PER_TRAIN_IMAGE indices whether
#   it is written or skipped, so file names line up between runs and the final
#   count equals the number of files accounted for.
#
# NOTE: file names produced by an earlier, unseeded run of this cell will not
# line up with this numbering; start from an empty output directory.

SPLIT_SEED = 42                            # fixed seed -> reproducible split
AUGS_PER_IMAGE = 3                         # augmented variants per training image
FILES_PER_TRAIN_IMAGE = 1 + AUGS_PER_IMAGE  # original + its augmentations


def _to_uint8(pixels):
  """Convert generator output (0-1 floats or 0-255 floats) to a uint8 image.

  Values are rounded rather than truncated so float round-trip error does not
  shift pixel values down by one level, and clipped so they cannot wrap around.
  """
  if pixels.max() <= 1.0:
    pixels = pixels * 255
  return np.clip(np.rint(pixels), 0, 255).astype(np.uint8)


def _copy_split(image_names, src_dir, dst_dir, cls, start_index):
  """Copy ``image_names`` from ``src_dir`` into ``dst_dir`` as '<cls>_<index>.jpg'.

  Destination files that already exist are left untouched. Returns the next
  unused index.
  """
  index = start_index
  for image_name in image_names:
    dst = os.path.join(dst_dir, f"{cls}_{index:04d}.jpg")
    if not os.path.exists(dst):
      shutil.copy(os.path.join(src_dir, image_name), dst)
    index += 1
  return index


split_rng = np.random.default_rng(SPLIT_SEED)
global_index = 1
for cls in classes:
  class_dir = os.path.join(original_dataset_dir, cls)
  # os.listdir order is unspecified; sort first so the seeded shuffle is stable.
  images = sorted(os.listdir(class_dir))
  split_rng.shuffle(images)

  train_split = int(train_alloc * len(images))
  val_split = int((train_alloc + val_alloc) * len(images))

  train_images = images[:train_split]
  val_images = images[train_split:val_split]
  test_images = images[val_split:]

  train_dst = os.path.join(train_dir, cls)
  for img in train_images:
    orig_dst = os.path.join(train_dst, f"{cls}_{global_index:04d}_orig.jpg")
    aug_dsts = [
        os.path.join(train_dst, f"{cls}_{global_index + k:04d}_aug{k}.jpg")
        for k in range(1, AUGS_PER_IMAGE + 1)
    ]
    missing_augs = [path for path in aug_dsts if not os.path.exists(path)]

    # Fully processed on a previous run: reserve its indices and move on.
    if os.path.exists(orig_dst) and not missing_augs:
      global_index += FILES_PER_TRAIN_IMAGE
      continue

    img_path = os.path.join(class_dir, img)
    image_bgr = cv2.imread(img_path)
    if image_bgr is None:
      # cv2.imread returns None instead of raising for unreadable files.
      # No index is consumed, so numbering stays identical on a re-run.
      print(f"Skipping unreadable file: {img_path}")
      continue

    if not os.path.exists(orig_dst):
      cv2.imwrite(orig_dst, image_bgr)

    if missing_augs:
      # The generator is fed RGB; convert back to BGR only when writing.
      image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
      augmentation_iter = train_datagen.flow(
          np.expand_dims(image_rgb, axis=0), batch_size=1
      )
      for aug_dst in missing_augs:
        augmented_image = _to_uint8(next(augmentation_iter)[0])
        cv2.imwrite(aug_dst, cv2.cvtColor(augmented_image, cv2.COLOR_RGB2BGR))

    global_index += FILES_PER_TRAIN_IMAGE

  global_index = _copy_split(
      val_images, class_dir, os.path.join(val_dir, cls), cls, global_index
  )
  global_index = _copy_split(
      test_images, class_dir, os.path.join(test_dir, cls), cls, global_index
  )

print(f"Augmentation complete! Processed {global_index-1} total images.")

Augmentation complete! Processed 2008 total images.
