# 2.1 Data Preprocessing
In this Notebook we will preprocess the data we generated earlier. We will:
- Categorize trees into the 4 classes
- Normalize the image data
- Split the data into training, validation and test sets
- Apply data augmentation techniques
- Perform oversampling
- One-hot encode the labels

__Note:__ For demonstration purposes we will only process the TDOP data in this Notebook. Processing vegetation height and other data in raster format works the same way.

### Importing needed libraries & packages

In [1]:
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

from sklearn.model_selection import train_test_split

### Loading, relabeling and encoding the data

In [2]:
# Location of the archive that holds the box images and their labels
data_path = './data/TDOP.npz'

# Read both arrays from the archive.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load archives from a trusted source.
with np.load(data_path, allow_pickle=True) as data:
    img_array = data["trees"]
    labels = data["labels"]

# After loading the data we only take the most current picture information
# (index 4); it is selected five times so that the leading axis has length 5.
# NOTE(review): this yields five identical copies of every image -- confirm
# that the duplication is intended.
latest_five_times = [4, 4, 4, 4, 4]
img_array = img_array[latest_five_times, :, :, :, :]
labels = labels[latest_five_times, :].astype("object")

TensorFlow prefers integers as target values, therefore we have to transform our labels.

As a first step, we create a new array filled with zeros that has the same shape as "labels".
Then we replace the zeros with an integer class code at every position that holds one of the 3 tree species of interest.

In [3]:
# Integer class code for every species of interest; any species that is not
# listed here falls into class 0
species_to_class = {
    'Rotbuche': 1,       # Rotbuche
    'Berg-Ahorn': 2,     # Ahorn
    'Feld-Ahorn': 2,     # Ahorn
    'Haenge-Birke': 3,   # Birke
    'Moor-Birke': 3,     # Birke
}

# Look up the class code of each tree; the species names are read from row 1
# of "labels"
class_per_tree = np.array(
    [species_to_class.get(species, 0) for species in labels[1]],
    dtype='int',
)

# Give every row of the result the same class codes so that it has the same
# shape as "labels"
labels_int = np.tile(class_per_tree, (labels.shape[0], 1)).astype('int')

We will do the one-hot encoding later, just before saving the preprocessed data for the model. What we will do now is normalize our 16-bit image data: the channel values of every pixel are scaled to unit L2 norm, which brings them into the 0 - 1 range.

In [4]:
# Scale the channel vector of every pixel (last axis) to unit L2 norm.
# NOTE(review): this is not a min-max rescaling of the 16-bit value range --
# confirm that per-pixel L2 normalization is the intended preprocessing.
img_array = tf.keras.utils.normalize(img_array, axis=-1, order=2)

In [5]:
# Inspect the dimensions of the normalized image array
img_array.shape

(5, 1703, 35, 35, 4)

### Splitting the data into training, validation and test sets

In [6]:
# Checking how many channels the image data has: 4 for standard RGB+IR TDOP
# images, 3 would be RGB only
nr_channels = img_array.shape[-1]

# Flatten all leading axes so that every box image becomes one sample; the
# image height and width are taken from the data instead of being hard-coded
img_height, img_width = img_array.shape[-3:-1]
reshaped_img = img_array.reshape(-1, img_height, img_width, nr_channels)
reshaped_labels = labels_int.reshape(-1)

# NOTE(review): if the leading axis of img_array holds repeated or multi-date
# views of the same tree, a random per-image split places views of one tree in
# several subsets (data leakage). Consider splitting on tree indices before
# flattening.

# Split ratios, both relative to the full data set
train_size = 0.8
val_size = 0.1

# The second split only sees the samples that are left after the training set
# was taken. Its test_size must therefore be the share of that remainder that
# does NOT go to validation (val_size / (1 - train_size) would be the
# validation share, which is only the same number when both shares are equal).
test_size = 1 - val_size / (1 - train_size)

# Fixed seed so that the split is reproducible between runs
split_seed = 42

# Generate indices for splitting (stratified, so class proportions are kept)
idx_train, idx_temp, labels_train, labels_temp = train_test_split(
    np.arange(len(reshaped_img)),
    reshaped_labels,
    train_size=train_size,
    stratify=reshaped_labels,
    random_state=split_seed,
)
idx_val, idx_test, labels_val, labels_test = train_test_split(
    idx_temp,
    labels_temp,
    test_size=test_size,
    stratify=labels_temp,
    random_state=split_seed,
)

# Pick the images that belong to each subset. The label arrays returned by
# train_test_split are already one-dimensional, so they need no reshaping.
img_train = reshaped_img[idx_train]
img_val = reshaped_img[idx_val]
img_test = reshaped_img[idx_test]

Let's see what the different sets look like now:

In [7]:
# Show the shapes of the training images and of their labels
print(img_train.shape)
print(labels_train.shape)

(6812, 35, 35, 4)
(6812,)


### Perform data augmentation

In [8]:
# Define a function that takes a dataset of images and returns every image in
# 12 orientations (the original plus 11 rotated / flipped versions)
def augment_images(img_array):
    """Return each image of a batch in 12 rotated / flipped variants.

    The rotations and flips are done with NumPy views instead of one
    TensorFlow op per step, which gives the same pixels without converting
    every image to a tensor and back.

    Parameters
    ----------
    img_array : array-like, shape (n_images, height, width, channels)
        Batch of images. The images must be square, because the rotated
        copies are written into slots that have the original image shape.

    Returns
    -------
    numpy.ndarray, shape (12, n_images, height, width, channels)
        ``result[k, i]`` is augmentation ``k`` of image ``i``; the dtype of
        the input is kept. The augmentations are:

        0. unchanged image
        1. 90° rotation
        2. 90° rotation + vertical flip
        3. 90° rotation + horizontal flip
        4. 90° rotation + vertical flip + horizontal flip
        5. 270° rotation
        6. 270° rotation + vertical flip
        7. 270° rotation + horizontal flip
        8. 270° rotation + vertical flip + horizontal flip
        9. vertical flip + horizontal flip
        10. vertical flip
        11. horizontal flip

    Notes
    -----
    A vertical flip followed by a horizontal flip is a 180° rotation, so only
    8 of the 12 variants are distinct: 4 equals 5, 8 equals 1, 6 equals 3 and
    7 equals 2. All 12 slots are kept because callers replicate the labels
    12 times.

    Progress is printed to stdout: a dot every 100 images and a percentage
    line every 1000 images.
    """
    img_array = np.asarray(img_array)
    n_images = img_array.shape[0]

    # Allocate the result once; slot 0 simply holds the unchanged images
    img_array_full = np.empty((12,) + img_array.shape, dtype=img_array.dtype)
    img_array_full[0] = img_array

    # Loop through each image
    for i in range(n_images):
        img = img_array[i]

        # Both rotations are needed several times, so compute them once
        rot_90 = np.rot90(img, k=1)
        rot_270 = np.rot90(img, k=3)

        # Augmentation number 1: 90° rotation
        img_array_full[1, i] = rot_90

        # Augmentation number 2: 90° rotation + vertical flip
        img_array_full[2, i] = np.flipud(rot_90)

        # Augmentation number 3: 90° rotation + horizontal flip
        img_array_full[3, i] = np.fliplr(rot_90)

        # Augmentation number 4: 90° rotation + vertical flip +
        # horizontal flip
        img_array_full[4, i] = np.fliplr(np.flipud(rot_90))

        # Augmentation number 5: 270° rotation
        img_array_full[5, i] = rot_270

        # Augmentation number 6: 270° rotation + vertical flip
        img_array_full[6, i] = np.flipud(rot_270)

        # Augmentation number 7: 270° rotation + horizontal flip
        img_array_full[7, i] = np.fliplr(rot_270)

        # Augmentation number 8: 270° rotation + vertical flip +
        # horizontal flip
        img_array_full[8, i] = np.fliplr(np.flipud(rot_270))

        # Augmentation number 9: vertical flip + horizontal flip
        img_array_full[9, i] = np.fliplr(np.flipud(img))

        # Augmentation number 10: vertical flip
        img_array_full[10, i] = np.flipud(img)

        # Augmentation number 11: horizontal flip
        img_array_full[11, i] = np.fliplr(img)

        # Report progress
        if i % 1000 == 0 and i != 0:
            print(' - ' + str(round(100*i/img_array.shape[0])) +
                  '% of all images have been augmented.')
        elif i % 100 == 0 and i != 0:
            print('.', end='')
    return img_array_full

In [9]:

# Augment the training images and replicate the labels once per augmentation
print('Augmenting training images')
img_train_full = augment_images(img_train)
print('\nAugmenting labels')
# The number of label copies is read from the augmented image array, so that
# images and labels stay aligned if the number of augmentations ever changes
labels_train_full = np.tile(labels_train, (img_train_full.shape[0], 1))

Augmenting training images
......... - 15% of all images have been augmented.
......... - 29% of all images have been augmented.
......... - 44% of all images have been augmented.
......... - 59% of all images have been augmented.
......... - 73% of all images have been augmented.
......... - 88% of all images have been augmented.
........
Augmenting labels


In [10]:

# Reshape augmented data: merge the augmentation axis into the sample axis.
# Image size and channel count are taken from the array itself, so RGB-only
# (3-channel) data or a different box size works as well.
img_train = np.reshape(img_train_full, (-1,) + img_train_full.shape[2:])
labels_train = np.reshape(labels_train_full, (-1))

How many more pictures did we get by augmenting the dataset?

In [11]:
# Shape of the training images after augmentation
print(img_train.shape)

(81744, 35, 35, 4)


In [12]:
# Shape of the training labels after augmentation
labels_train.shape

(81744,)

### Oversample the data
We oversample the training data by randomly picking images until all classes have the same number of samples.

In [13]:
# Calculate the frequency of each class in the data; "classes" holds the
# label values themselves, so the code does not rely on them being 0, 1, 2, ...
classes, n_all = np.unique(labels_train, return_counts=True)

# Determine the size of the class with the most data
n_max = np.max(n_all)

# Collect the original data plus the extra samples of every class and join
# them once at the end; appending to the big image array inside the loop
# would copy the whole array for every class
img_parts = [img_train]
label_parts = [labels_train]

# Loop through all class categories
for class_id, n_class in zip(classes, n_all):

    # Number of samples this class is missing compared to the largest class
    n_missing = n_max - n_class

    # Nothing to do for a class that already has the highest frequency
    if n_missing == 0:
        continue

    # Positions of all samples of this class in the training data
    class_idx = np.flatnonzero(labels_train == class_id)

    # Draw random positions with replacement. The upper bound of randint is
    # exclusive, so passing the class size makes every sample of the class
    # eligible, including the last one.
    rand_ind = np.random.randint(0, class_idx.size, n_missing)
    picked = class_idx[rand_ind]

    # Take the randomly drawn images together with the very same labels
    img_parts.append(img_train[picked])
    label_parts.append(labels_train[picked])

# Oversampled images and labels: the original data followed by the extra
# samples of each class
img_train_os = np.concatenate(img_parts, axis=0)
labels_train_os = np.concatenate(label_parts, axis=0)

Compare the size of the training sets before and after oversampling:

In [14]:
# Training image shapes before and after oversampling
print(img_train.shape)
print(img_train_os.shape)

(81744, 35, 35, 4)
(181824, 35, 35, 4)


### One-hot encode the labels

In [15]:
# Take the number of classes from the training labels and pass it to every
# call. Without it, to_categorical sizes each result from the largest label
# of that one array, so a subset that lacks the highest class would get fewer
# columns than the training labels.
num_classes = int(np.max(labels_train_os)) + 1

labels_train_os = tf.keras.utils.to_categorical(labels_train_os, num_classes=num_classes)
labels_test = tf.keras.utils.to_categorical(labels_test, num_classes=num_classes)
labels_val = tf.keras.utils.to_categorical(labels_val, num_classes=num_classes)

Now we are done and can save the data back to a `.npz` file.

In [16]:
# Store the three image sets and their one-hot labels in a single archive;
# np.savez adds the ".npz" extension to the file name
np.savez(
    "./data/data_preprocessed",
    img_train=img_train_os,
    labels_train=labels_train_os,
    img_test=img_test,
    labels_test=labels_test,
    img_val=img_val,
    labels_val=labels_val,
)