<a href="https://colab.research.google.com/github/Stan081/AML_CourseWork/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*PREPROCESSING*

This notebook preprocesses the datasets used for this project. It contains the functions needed to make the datasets ready for training all models, with a final output size of **(a×b)**. The notebook is divided into two parts.
Part one preprocesses dataset 1.
Part two preprocesses dataset 2.

**Dataset One**

**Dataset Two**

In [None]:
import numpy as np
import cv2 as cv
import shutil
import math
import os

In [None]:
# --- Locations ---------------------------------------------------------------

# Directory the notebook is executed from
root_folder = os.getcwd()

# Folder holding the original (untiled) dataset, resolved relative to root_folder
data_folder = os.path.join(root_folder, 'Downloads/AerialImageDataset_small/')

# --- Tiling parameters -------------------------------------------------------

# Side length, in pixels, of a source image in the Inria Aerial Image Dataset
master_size = 5000

# Side length, in pixels, of each square tile to produce
image_size = 384

# Nominal overlap between neighbouring tiles, as a fraction of the tile size
# (0.3 means 30 %)
overlap = 0.3

In [None]:
# Locations of the original (untiled) data inside data_folder
src_train_folder = os.path.join(data_folder, 'train', 'images')   # training images
src_train_folder_gt = os.path.join(data_folder, 'train', 'gt')    # training ground truth
src_test_folder = os.path.join(data_folder, 'test', 'images')     # test images

# Echo the resolved paths so that a wrong data_folder is spotted early
for _label, _path in (
    ('Training images address    = ', src_train_folder),
    ('Training gt images address = ', src_train_folder_gt),
    ('Testing images address     = ', src_test_folder),
):
    print(_label, _path)

In [None]:
# Training set file names.
# os.listdir() returns entries in arbitrary order and also lists sub-directories
# (e.g. Jupyter's .ipynb_checkpoints) and hidden files (e.g. .DS_Store). None of
# those can be read as images, so keep only visible regular files, and sort them
# so that every run processes the images in the same order.
src_train_images = sorted(
    name for name in os.listdir(src_train_folder)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(src_train_folder, name))
)
print(src_train_images)
print('Total number of main images = ',len(src_train_images))

In [None]:
# Testing set file names.
# os.listdir() returns entries in arbitrary order and also lists sub-directories
# and hidden files; keep only visible regular files and sort them so the list is
# reproducible between runs.
src_test_images = sorted(
    name for name in os.listdir(src_test_folder)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(src_test_folder, name))
)
print(src_test_images)

In [None]:
# Destination layout for the generated tiles:
#   <data_folder>/train_<size>x<size>/images   image tiles
#   <data_folder>/train_<size>x<size>/gt       ground-truth tiles
_tiles_dirname = 'train_{}x{}'.format(image_size, image_size)
train_folder_root = os.path.join(data_folder, _tiles_dirname)
train_folder, train_folder_gt = (
    os.path.join(train_folder_root, _sub) for _sub in ('images', 'gt')
)

# Show the three resolved destination paths, one per line
print(train_folder_root, train_folder, train_folder_gt, sep='\n')

In [None]:
# Create the destination folders, starting from an empty state on every run.

# Root folder for the tiles: keep it if it is already there
os.makedirs(train_folder_root, exist_ok=True)

for _dst in (train_folder, train_folder_gt):
    # Drop tiles left over from a previous run ...
    if os.path.exists(_dst):
        shutil.rmtree(_dst)
    # ... and always (re)create the folder. Removing it without recreating it
    # would leave no directory for the tiling step to write into when the
    # notebook is run a second time.
    os.makedirs(_dst)

In [None]:
# Number of tiles per side for a nominal overlap of 0.3 (i.e. 30 %) between
# neighbouring tiles:  ceil(5000 / (384 - 384 * 0.3))
_overlap_px = image_size * overlap
count = math.ceil(master_size / (image_size - _overlap_px))

# Distance, in pixels, between the origins of two consecutive tiles
step = (master_size - _overlap_px) / count
print('count =', count, ', step =', step)

In [None]:
# Slice every training image and its ground truth into tiles of
# image_size * image_size pixels and write the tiles to train_folder /
# train_folder_gt.
# With the configuration used in this notebook (5000 px sources, 384 px tiles,
# 0.3 overlap) count is 19, so each source yields 19 * 19 = 361 tiles.

# Top-left coordinate of every tile along one axis. The same offsets are used
# for rows and columns, so compute them once instead of once per iteration.
# The last tile is pinned to the image border so it never runs past the edge.
tile_offsets = [round(k * step) for k in range(count - 1)]
tile_offsets.append(master_size - image_size)

for filename in src_train_images:
    print(filename)
    master_img = cv.imread(os.path.join(src_train_folder, filename))
    master_img_gt = cv.imread(os.path.join(src_train_folder_gt, filename))

    # cv.imread returns None instead of raising when a file is missing or is
    # not a readable image; skip such entries rather than failing later with an
    # obscure "NoneType is not subscriptable" error.
    if master_img is None or master_img_gt is None:
        print('  skipped: could not read image and/or ground truth for', filename)
        continue

    # File name without its extension, whatever the extension length
    # (ex: austin1.tif ==> austin1)
    base_name = os.path.splitext(filename)[0]

    for i, y in enumerate(tile_offsets):        # i: tile row,    y: top edge
        for j, x in enumerate(tile_offsets):    # j: tile column, x: left edge
            # Slice the image and its ground truth at (x, y);
            # the first tile is master_img[0:384, 0:384]
            img = master_img[y:y+image_size, x:x+image_size]
            img_gt = master_img_gt[y:y+image_size, x:x+image_size]

            # Image tiles are stored as .jpg, ground-truth tiles as .png
            img_fname = '{}_{}_{}.{}'.format(base_name, i, j, 'jpg')
            img_gt_fname = '{}_{}_{}.{}'.format(base_name, i, j, 'png')

            # cv.imwrite reports failure through its return value (e.g. when the
            # destination folder does not exist); surface it instead of silently
            # producing an incomplete dataset.
            for out_path, tile in (
                (os.path.join(train_folder, img_fname), img),
                (os.path.join(train_folder_gt, img_gt_fname), img_gt),
            ):
                if not cv.imwrite(out_path, tile):
                    raise OSError('Failed to write tile: {}'.format(out_path))