# Data Pre-processing notebook for Catalyst Project 
## For project: "Utilizing geospatial data for assessing energy security: Mapping small solar home systems using unmanned aerial vehicles and deep learning"

### Layout of this notebook: each experiment has its own chunk of code that moves or processes the data it needs

1. Exp 1.1 (Fig 5) different resolutions:
- Stratify the dataset into different resolutions
2. Exp 1.2 (Fig 4) satellite resolution:
- Change resolution to satellite resolution
3. Exp 1.3 (appendix result) change resolution to test set resolution:
- Change resolution to test set resolution
4. Exp 1.4 (Fig 6) flying speed:
- Stratify the dataset into two different speeds
5. Exp 1.5 (appendix result) artificial motion blur:
- Apply artificial motion blur


In [None]:
# First, let's import some packages
import numpy as np
import shutil
import os
import pandas as pd
from shutil import copyfile

In [None]:
# Common helper function of getting the height of the imagery
def get_height(img_name):
    """
    Parse the flight height encoded in an image name.

    The name is expected to contain 'height_<number>m'; the number between
    the last 'height_' and the next 'm' is returned as an int.

    :param img_name: File name (or path) of the image
    :return: The height as an int, or None (after printing a message) when
             the name does not contain 'height'
    """
    # Guard clause: nothing to parse if the name carries no height tag
    if 'height' not in img_name:
        print('Your file {} does not have height information in its name, aborting!'.format(img_name))
        return None

    # Everything after the last 'height_' tag ...
    after_tag = img_name.rpartition('height_')[2]
    # ... up to (not including) the first 'm' is the numeric height
    height_text = after_tag.partition('m')[0]
    return int(height_text)
def get_d_group_number(height):
    """
    Return the group number (1~4) according to the height value

    Groups are 20 units wide, lower bound inclusive and upper bound exclusive:
    d1 = [45, 65), d2 = [65, 85), d3 = [85, 105), d4 = [105, 125).

    :param height: The height of the image, or None when the height could not
                   be parsed from the file name (get_height returns None then)
    :return: The group number 1~4, or None if the height is missing or falls
             outside [45, 125)
    """
    # A missing height belongs to no group; comparing None with an int would
    # otherwise raise a TypeError before the caller can skip the file
    if height is None or height < 45:
        return None

    # Exclusive upper bound of each group, in group order
    for group_number, upper_bound in enumerate((65, 85, 105, 125), start=1):
        if height < upper_bound:
            return group_number

    # At or above the last upper bound
    return None

## Exp 1.1 (Fig 5): Stratify the dataset into different resolution

### Stratify the train/val/test into 4 different categories (d1-d4), lower bound inclusive and upper bound exclusive:
1. d1 = 45m-65m height
2. d2 = 65m-85m height
3. d3 = 85m-105m height
4. d4 = 105m-125m height

In [None]:
target_folder = 'Exp1_1_resolution_buckets'
limits = [45, 65, 85, 105, 125]     # Set the height limit list
lbl_flag_list = ['img','labels']    # The flag folder name whether this is a label folder or not
train_val_list = ['train_val', 'test']
train_val_folder = 'all/train_val_set'
test_folder = 'all/test_set'

# First create the folders to be copied into: <target>/<split>/d<i>/<img|labels>
for i in range(1, 5):
    for lbl_flag in lbl_flag_list:
        for train_val_flag in train_val_list:
            folder = os.path.join(target_folder, train_val_flag, 'd{}'.format(i), lbl_flag)
            os.makedirs(folder, exist_ok=True)


def _copy_split_into_height_buckets(src_folder, dest_root, split_name):
    """
    Copy every image of src_folder, and its label, into the height bucket it belongs to.

    :param src_folder: Folder holding the raw images (JPG) next to their labels (png)
    :param dest_root: Root of the bucketed dataset
    :param split_name: Name of the split sub-folder under dest_root ('train_val' or 'test')
    """
    for file in os.listdir(src_folder):
        if file.endswith('png'):    # If this is png, this is a label, skip
            continue

        # Get the height and thus group number of the imagery
        cur_img_height = get_height(file)
        if cur_img_height is None:  # No height in the file name (get_height already reported it), skip
            continue
        d_number = get_d_group_number(cur_img_height)
        if d_number is None:        # If this imagery is not part of the groups
            continue

        # The label shares the image name, with png instead of JPG
        label_file = file.replace('JPG', 'png')
        bucket_folder = os.path.join(dest_root, split_name, 'd{}'.format(d_number))

        # The raw image
        copyfile(os.path.join(src_folder, file), os.path.join(bucket_folder, 'img', file))
        # The label
        copyfile(os.path.join(src_folder, label_file), os.path.join(bucket_folder, 'labels', label_file))


# NOTE: a random train/validation split (validation_ratio = 0.2) used to be done here;
# training and validation images are now kept together in 'train_val'.
# Loop over the train_val dataset
_copy_split_into_height_buckets(train_val_folder, target_folder, 'train_val')

# Loop over the test dataset
_copy_split_into_height_buckets(test_folder, target_folder, 'test')

## Exp 1.2 (Fig 4) satellite resolution: Change resolution to satellite resolution

### Stratify the train/val/test into 7.5cm, 15cm, 30cm & 60cm

Here, we only use the lowest-resolution images (the d4 category above), since they are the closest to the target resolutions.

In [None]:
# Some helper functions and packages
import cv2
from skimage import io
from skimage.util import pad

def get_current_res(height):
    """
    Convert a flight height into the resolution of the imagery taken at that height.

    NOTE(review): 6.17, 4000 and 4.3 look like camera constants (presumably sensor
    width, image width in pixels and focal length) -- confirm against the camera spec.

    :param height: The flight height
    :return: height * 6.17 / 4000 / 4.3
    """
    scaled_by_sensor = height * 6.17
    per_pixel = scaled_by_sensor / 4000
    return per_pixel / 4.3

def change_resolution(img_name, target_res):
    """
    The helper function to change resolution

    The image is resized by the factor current_res / target_res and then resized
    back to its original pixel dimensions, so the returned array has the same
    width and height as the input.

    NOTE(review): change_height_to_new_height also runs this on label images;
    INTER_AREA / INTER_LINEAR interpolate pixel values, so mask edges may end up
    with intermediate values -- confirm this is intended for labels.

    :param img_name: Path of the image; its flight height is parsed from the name by get_height
    :param target_res: The resolution to change to, in the unit returned by get_current_res
    :return: The image array after resizing to the target resolution and back
    :raises ValueError: If the image name carries no height information
    """
    height = get_height(img_name)                                       # Get height
    if height is None:
        # Fail with a clear message instead of an opaque TypeError in the arithmetic below
        raise ValueError('Cannot change resolution of {}: no height information in its name'.format(img_name))

    img = io.imread(img_name)
    current_res = get_current_res(height)                               # Get current resolution
    scale = current_res / target_res                                    # Calculate scale
    # cv2.resize takes (width, height); keep at least one pixel per side so a
    # very small scale cannot request an empty image
    new_shape = (max(1, int(scale * img.shape[1])), max(1, int(scale * img.shape[0])))
    resized = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA)  # resize to small size
    # Rescale back to the original size
    resize_back = cv2.resize(resized, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_LINEAR)
    return resize_back

def change_height_to_new_height(master_folder, target_height, dest_folder):
    """
    Re-create an image folder (both 'img' and 'labels' sub-folders) at the
    resolution corresponding to another flight height.

    :param master_folder: Source folder containing the 'img' and 'labels' sub-folders
    :param target_height: The flight height whose resolution the files are changed to
    :param dest_folder: Destination folder; its 'img' and 'labels' sub-folders are created if missing
    """
    image_extensions = ('.png', '.jpg', '.JPG')

    # (source, destination) folder pairs: images first, annotations second
    folder_pairs = [
        (os.path.join(master_folder, sub_folder), os.path.join(dest_folder, sub_folder))
        for sub_folder in ('img', 'labels')
    ]

    # Make sure both destination folders exist before any processing starts
    for _, out_dir in folder_pairs:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    for in_dir, out_dir in folder_pairs:
        for name in os.listdir(in_dir):
            # Only image files are processed
            if not name.endswith(image_extensions):
                continue
            # Change the resolution and save under the same file name
            degraded = change_resolution(img_name=os.path.join(in_dir, name),
                                         target_res=get_current_res(target_height))
            io.imsave(os.path.join(out_dir, name), degraded)
    


In [None]:
sat_height_list = [210, 420, 840, 1680]         # Drone flight heights corresponding to 7.5, 15, 30, 60cm GSD
res_list = [7.5, 15, 30, 60]
E1_2_target_folder = 'Exp1_2_sat_res'
# Pair each simulated flight height with the resolution used in its folder name
for sat_height, res in zip(sat_height_list, res_list):
    res_folder = os.path.join(E1_2_target_folder, 'res_{}'.format(res))
    for train_status in ['train_val','test']:
        # Source is always the d4 bucket produced by Exp 1.1
        cur_folder = os.path.join('Exp1_1_resolution_buckets', train_status, 'd4')
        change_height_to_new_height(master_folder=cur_folder,
                                    target_height=sat_height,
                                    dest_folder=os.path.join(res_folder, train_status))

## Exp 1.3: (appendix result) change resolution to test set resolution:
Create new versions of each test set whose resolution is changed to that of the other height groups

In [None]:
new_test_height_list = [55, 75, 95, 115]         # Target heights for d1~d4 (the mid-point of each 20m height group)
E1_3_target_folder = 'Exp1_3_changed_testset'
for src_group in range(1, 5):
    # The Exp1.1 test folder of the source group
    cur_folder = os.path.join('Exp1_1_resolution_buckets', 'test', 'd{}'.format(src_group))
    for dst_group, dst_height in enumerate(new_test_height_list, start=1):
        # Changing a group to its own height is a no-op, skip it
        if dst_group == src_group:
            continue
        changed_name = 'd{}_changed_to_d{}'.format(src_group, dst_group)
        change_height_to_new_height(master_folder=cur_folder,
                                    target_height=dst_height,
                                    dest_folder=os.path.join(E1_3_target_folder, changed_name, 'test'))

## Exp 1.4: (Fig 6) flying speed: Stratify the dataset into two different speed

There are two flying speeds: N for normal mode and S for sports mode. All of these images are used as a test set, and our trained models are applied to them.

In [None]:
target_folder = 'Exp1_4_moving_imgs'
lbl_flag_list = ['img','labels']    # The flag folder name whether this is a label folder or not
storage_folder = 'all/moving_labeled/'
# First create the folders to be copied into: <target>/d<i>_<S|N>_mode/test/<img|labels>
# Each folder is created independently, so an already existing S_mode folder
# no longer prevents the matching N_mode folder from being created
for i in range(1, 5):
    for lbl_flag in lbl_flag_list:
        for mode in ['S','N']:
            folder = os.path.join(target_folder, 'd{}_{}_mode'.format(i, mode), 'test', lbl_flag)
            os.makedirs(folder, exist_ok=True)

# Loop over the test dataset
for lbl_flag in lbl_flag_list:
    for mode in ['S','N']:
        folder = os.path.join(storage_folder, lbl_flag, '{}_mode'.format(mode))
        for file in os.listdir(folder):
            # Get the height and thus group number of the imagery
            cur_img_height = get_height(file)
            if cur_img_height is None:  # No height in the file name (get_height already reported it), skip
                continue
            d_number = get_d_group_number(cur_img_height)

            if d_number is None: # If this imagery is not part of the groups
                continue

            # Copy the file
            copyfile(os.path.join(folder, file),
                    os.path.join(target_folder, 'd{}_{}_mode'.format(d_number, mode),'test', lbl_flag, file))

## Exp 1.5: (appendix result) Apply artificial motion blur

In [None]:
def apply_motion_blur_and_save(src_img_dir, dest_img_dir, motion_blur_kernel_size, src_img_postfix='.JPG', save_label_postfix='.png'):
    """
    This function applies motion blur to image and save them. The motion blur part is inspired by: https://www.geeksforgeeks.org/opencv-motion-blur-in-python/
    :param src_img_dir, dest_img_dir: The source image directory to get the images and destination directory to save the imagery
    :param motion_blur_kernel_size: The size of the kernel of convolution, the larger the larger the motion is (kernel size - 1 is the motion blur in pixels)
    :param src_img_postfix: The postfix of the images to be blurred
    :param save_label_postfix: The postfix of the label images to save, if None it means that don't copy the label images there (default is .png and yes copy them as well without touching them)
    """
    # Make sure destination exists
    if not os.path.isdir(dest_img_dir):
        os.makedirs(dest_img_dir)

    # Set up the kernel used for blurring
    kernel = np.zeros((motion_blur_kernel_size, motion_blur_kernel_size))
    # Fill the middle column with ones (the blur therefore acts along the vertical image axis).
    kernel[:, int((motion_blur_kernel_size - 1)/2)] = np.ones(motion_blur_kernel_size)
    # Normalize so the kernel sums to one.
    kernel /= motion_blur_kernel_size

    for img_name in os.listdir(src_img_dir):
        # Only process the ones that has the pre-defined postfix
        if not img_name.endswith(src_img_postfix):
            continue
        cur_img_name = os.path.join(src_img_dir, img_name)
        # Add motion blur to this image
        img = cv2.imread(cur_img_name)
        img_mb = cv2.filter2D(img, -1, kernel)

        # Save this image
        cv2.imwrite(os.path.join(dest_img_dir, img_name), img_mb)

        # If the label exist and we want to copy it as well, copy it to new destination
        if save_label_postfix is not None:
            # Swap only the trailing postfix of the file name, and build both paths
            # from their directories; replacing substrings on the full path would
            # also rewrite any other occurrence of the postfix or directory string
            name_stem = img_name[:len(img_name) - len(src_img_postfix)]
            label_name = name_stem + save_label_postfix
            label_file = os.path.join(src_img_dir, label_name)
            if os.path.exists(label_file):
                shutil.copyfile(label_file, os.path.join(dest_img_dir, label_name))

def copy_files_in_list(scr_dir, dest_dir, file_list):
    """
    Copy all files/dir in scr_dir to dest_dir if name in the file_list

    :param scr_dir: The source directory whose entries are considered
    :param dest_dir: The destination directory, created if it does not exist
    :param file_list: Names of the files / directories (directly under scr_dir) to copy
    """
    for file_or_dir in os.listdir(scr_dir):
        if file_or_dir not in file_list:
            continue
        src_path = os.path.join(scr_dir, file_or_dir)
        dest_path = os.path.join(dest_dir, file_or_dir)
        if os.path.isdir(src_path):
            # Directories are copied recursively (copytree creates dest_dir if needed)
            shutil.copytree(src_path, dest_path)
        else:
            # copytree only accepts directories, so plain files are copied individually
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copy2(src_path, dest_path)

motion_blur_list = [3, 5]
src_dir_list = ['Exp1_1_resolution_buckets/test/d{}'.format(i) for i in range(1, 5)]
target_dir = 'Exp1_5_artificial_motion_blur'
for src_dir in src_dir_list:
    for motion_blur in motion_blur_list:
        # One destination per (height group, blur) pair, named after the group
        # and the kernel size minus one, e.g. d1_mb_2
        dest_dir = os.path.join(target_dir, os.path.basename(src_dir) + '_mb_{}'.format(motion_blur - 1))
        apply_motion_blur_and_save( os.path.join(src_dir, 'img'),
                                    os.path.join(dest_dir, 'test', 'img'),
                                    motion_blur_kernel_size=motion_blur)
        # The labels are copied unchanged. shutil.copytree replaces the former
        # 'cp -r' shell call: it is portable, handles paths with spaces and raises
        # on failure. An existing labels folder is left as is, so a rerun does
        # not nest a second 'labels' folder inside it.
        label_dest = os.path.join(dest_dir, 'test', 'labels')
        if not os.path.isdir(label_dest):
            shutil.copytree(os.path.join(src_dir, 'labels'), label_dest)

## Helper functions that were used in the past and are no longer needed

In [None]:
# Helper functions
def label_each_img_with_folder_name():
    """
    Move every file of the '2021' folders into 'all', prefixing its name with the folder name

    Only directories of the current working directory whose name contains
    '2021' are processed. Each entry <folder>/<file> is moved (os.rename) to
    all/<folder><file>.
    """
    # os.rename fails if the destination directory is missing, so create it first
    os.makedirs('all', exist_ok=True)

    for folder in os.listdir('.'):
        # Only work in the folders with 2021 in the folder name
        if '2021' not in folder or not os.path.isdir(folder):
            continue
        for file in os.listdir(folder):
            cur_file = os.path.join(folder, file)
            # Move the file into 'all' under its folder-prefixed name
            os.rename(cur_file, os.path.join('all', folder + file))
# NOTE(review): this call runs as soon as the cell is executed and moves files with os.rename;
# the section title describes these helpers as obsolete -- confirm it should still be invoked.
label_each_img_with_folder_name()
