# BiteMe | Preprocessing

The purpose of this notebook is to create the image preprocessing pipeline to be used during train/test time. The output will be functions we can include in the `preprocessing.py` script. 

TODO: 
 - Preprocessing pipeline
 - Train/test split
 - Augmentations
 - Write augmented images into `preprocessed/train/<label>/...` and `preprocessed/test/<label>/...`
 - Write metadata that includes the processed images: write the augmented images first, then rename them to their hash, then create the metadata.
 - [Histogram Equalization and Adaptive Histogram Equalization (CLAHE)](https://pyimagesearch.com/2021/02/01/opencv-histogram-equalization-and-adaptive-histogram-equalization-clahe/)

In [1]:
import pandas as pd
import numpy as np
import os
import sys

from tqdm import tqdm

import cv2
import albumentations as A
import imgaug as ia
import imgaug.augmenters as iaa

sys.path.append("..")
from helpers import read_images, get_train_test_split, augs#, get_augs
from constants import ROWS, COLS, CHANNELS, SEED, TEST_SIZE, VERBOSE

# Seed NumPy's global RNG and imgaug's global RNG with the shared SEED constant
# so that random operations drawing from them are repeatable between runs.
np.random.seed(SEED)
ia.seed(SEED)

In [2]:
# Define directories.
# All paths are relative to the notebook's working directory; the project root is one level up.
base_dir_path = "../"

# <root>/data, <root>/data/cleaned (input) and <root>/data/preprocessed (output)
data_dir_path = os.path.join(base_dir_path, "data")
data_cleaned_dir_path = os.path.join(data_dir_path, "cleaned")
data_preprocessed_dir_path = os.path.join(data_dir_path, "preprocessed")

# Directory listings of the data and cleaned-data folders.
# These also fail fast (FileNotFoundError) if either folder is missing.
data_dir = os.listdir(data_dir_path)
data_cleaned_dir = os.listdir(data_cleaned_dir_path)

# Metadata describing the cleaned images (columns shown below: img_name, img_path, label).
# NOTE(review): the displayed frame also contains an "Unnamed: 0" column, which suggests the
# cleaned CSV was saved together with its index -- confirm, and consider dropping it on read.
metadata_cleaned_path = os.path.join(data_cleaned_dir_path, "metadata.csv")
metadata = pd.read_csv(metadata_cleaned_path)

# Flag: when True, the split images are written to disk further down the notebook.
write_preprocessed_images = False

# Preview the first rows of the metadata.
metadata.head()

Unnamed: 0,img_name,img_path,label
0,7059b14d2aa03ed6c4de11afa32591995181d31c.jpg,../data/cleaned/none/7059b14d2aa03ed6c4de11afa...,none
1,ea1b100b581fcdb7ddfae52cc62347a99e304ba4.jpg,../data/cleaned/none/ea1b100b581fcdb7ddfae52cc...,none
2,1a1442990ff143b7560e5757d9f76d37ab007f48.jpg,../data/cleaned/none/1a1442990ff143b7560e5757d...,none
3,6eac051b9c45ff6821ec8675216f371711b7cea9.jpg,../data/cleaned/none/6eac051b9c45ff6821ec86752...,none
4,fc72767f8520df9b2b83941077dc0ee013eb9399.jpg,../data/cleaned/none/fc72767f8520df9b2b8394107...,none


## Split Data into Train/Test

In [3]:
# Split data into train and test.
# train_idx / test_idx are used later in this notebook to index both `img_array` and the
# metadata rows; y_train / y_test are presumably the labels matching those indices
# (confirm against helpers.get_train_test_split).
# The per-split image and label counts in this cell's output are printed by the helper.
train_idx, test_idx, y_train, y_test = get_train_test_split(
    metadata_df=metadata, 
    test_size=TEST_SIZE,
    verbose=VERBOSE
)

192 train images
22 test images

TRAIN IMAGE COUNTS
------------------
tick        26
mosquito    25
horsefly    25
bedbug      25
none        25
ant         23
bee         22
mite        21
Name: label, dtype: int64

TEST IMAGE COUNTS
------------------
bedbug      3
tick        3
ant         3
horsefly    3
mosquito    3
none        3
mite        2
bee         2
Name: label, dtype: int64


In [4]:
# Record which split every image belongs to and re-write the metadata csv for the
# preprocessed dataset.
# NOTE: this will need to be updated if we generate synthetic images.
metadata["split"] = "train"
# Assign through a single .loc call. The chained form `metadata["split"][test_idx] = ...`
# raises SettingWithCopyWarning and silently does nothing under pandas copy-on-write.
metadata.loc[test_idx, "split"] = "test"

# Make sure the output directory exists before writing into it.
os.makedirs(data_preprocessed_dir_path, exist_ok=True)
metadata_preprocessed_path = os.path.join(data_preprocessed_dir_path, "metadata.csv")
metadata.to_csv(metadata_preprocessed_path, index=False)

## Create Preprocessing Pipeline

In [5]:
# Load every cleaned image into one array using the shared ROWS / COLS / CHANNELS constants.
# write_images=False with output_data_dir_path=None, so the helper is not asked to write
# anything to disk here. The logged shape in the output below is (n_images, rows, cols, channels).
img_array = read_images(
    data_dir_path=data_cleaned_dir_path, 
    rows=ROWS, 
    cols=COLS, 
    channels=CHANNELS, 
    write_images=False, 
    output_data_dir_path=None,
    verbose=VERBOSE
)

# Split images into train/test using the indices from the train/test split.
# NOTE(review): this is positional indexing, so it assumes read_images returns the images in
# the same order as the rows of `metadata` -- confirm in helpers.read_images.
X_train = img_array[train_idx]
X_test = img_array[test_idx]

Reading images from: ../data/cleaned
Rows set to 512
Columns set to 512
Channels set to 3
Writing images is set to: False
Reading images...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 177.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 94.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 66.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 51.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 41.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 36.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 31.59it/s]
100%|█

Image reading complete.
Image array shape: (214, 512, 512, 3)


In [6]:
# Write every image to preprocessed/<split>/<label>/<img_name>, where <split> is the value
# of the metadata "split" column ("train" or "test"). Skipped unless the flag is set.
if write_preprocessed_images:
    # Make train/test dirs for preprocessed images (no error if they already exist)
    for split_name in ("train", "test"):
        os.makedirs(os.path.join(data_preprocessed_dir_path, split_name), exist_ok=True)

    # cv2.imwrite reports failure through its return value, so collect failures
    # instead of silently dropping images.
    failed_writes = []

    # Write preprocessed images (split) to preprocessed directory
    for idx in tqdm(metadata.index):
        split_name = metadata.loc[idx, "split"]
        # Rows with any other split value are skipped
        if split_name not in ("train", "test"):
            continue

        # If it doesn't exist, create the label directory for this split
        img_dir_path = os.path.join(data_preprocessed_dir_path, split_name, metadata.loc[idx, "label"])
        os.makedirs(img_dir_path, exist_ok=True)

        # Create img write path and write the image.
        # NOTE(review): img_array is indexed with the metadata index label, which assumes a
        # default 0..n-1 index aligned with the image order -- confirm.
        img_path_write = os.path.join(img_dir_path, metadata.loc[idx, "img_name"])
        if not cv2.imwrite(img_path_write, img_array[idx]):
            failed_writes.append(img_path_write)

    if failed_writes:
        print(f"WARNING: failed to write {len(failed_writes)} image(s): {failed_writes}")

## Run Preprocessing Pipeline

In [None]:
def get_augs(imgs_raw: np.ndarray, 
             labels_raw: np.ndarray, 
             augs: list,
             keep_originals: bool=True, 
             verbose: bool=True) -> tuple:
    """
    Applies the requested augmentations to a batch of images and replicates the labels to match.

    Every augmentation is applied independently to the full raw batch, and the resulting
    batches are stacked along the first axis in the order given by `augs`.
    
    Parameters
    ----------
    imgs_raw : np.ndarray
        Array of raw images to augment, stacked along the first axis.
    labels_raw : np.ndarray
        Array of raw labels, retaining order in imgs_raw.
    augs : list
        List of augmentations to perform. Each entry is either the case-insensitive name of a
        built-in augmentation ("fliplr", "flipud", "gaussianblur") or a callable that accepts
        an `images=` keyword and returns the augmented batch (e.g. an imgaug augmenter).
    keep_originals : bool
        If True, the original images come first in the output, followed by the augmented
        images. Otherwise only the augmented images are returned.
    verbose : bool
        If True, prints verbose logging.
    
    Returns
    -------
    imgs_aug : np.ndarray
        Array containing augmented images. 
    labels_aug : np.ndarray
        Array containing augmented labels, replicating order in imgs_aug.

    Raises
    ------
    ValueError
        If an entry of `augs` is a name that is not a built-in augmentation.
    TypeError
        If an entry of `augs` is neither a name nor a callable.
    """
    # Built-in augmentations, keyed by lower-case name.
    # GaussianBlur takes no `p` argument (unlike the flips), so only sigma is passed.
    builtin_augs = {
        "fliplr": iaa.Sequential([iaa.Fliplr(p=1)]),  # Flip horizontally left to right
        "flipud": iaa.Sequential([iaa.Flipud(p=1)]),  # Flip vertically up to down
        "gaussianblur": iaa.Sequential([iaa.GaussianBlur(sigma=6.0)]),
    }

    # Resolve every requested augmentation to something callable with `images=`
    augmenters = []
    for aug in augs:
        if isinstance(aug, str):
            try:
                augmenters.append(builtin_augs[aug.lower()])
            except KeyError:
                raise ValueError(
                    f"Unknown augmentation {aug!r}; expected one of {sorted(builtin_augs)}."
                ) from None
        elif callable(aug):
            augmenters.append(aug)
        else:
            raise TypeError(
                f"Augmentations must be names or callables, got {type(aug).__name__}."
            )

    imgs_raw = np.asarray(imgs_raw)
    labels_raw = np.asarray(labels_raw)

    # One augmented copy of the whole batch per augmentation
    aug_batches = [np.asarray(augmenter(images=imgs_raw)) for augmenter in augmenters]
    num_created = sum(batch.shape[0] for batch in aug_batches)

    # The labels are repeated once per image batch, so images and labels always stay
    # aligned regardless of how many augmentations were requested.
    img_parts = ([imgs_raw] if keep_originals else []) + aug_batches
    label_parts = [labels_raw] * len(img_parts)

    if img_parts:
        imgs_aug = np.concatenate(img_parts, axis=0)
        labels_aug = np.concatenate(label_parts, axis=0)
    else:
        # Nothing requested and originals dropped: return empty arrays of matching shape
        imgs_aug = imgs_raw[:0]
        labels_aug = labels_raw[:0]
        
    # Logging
    if verbose:
        print(f"Used augs: {list(augs)}")
        print(f"Created {num_created} augmentations.")
        print(f"Image array shape: {imgs_aug.shape}")
        print(f"Labels array shape: {labels_aug.shape}")
    
    return imgs_aug, labels_aug

In [None]:
# Example preprocessing run
# get_augs requires `augs` and returns exactly two arrays (images, labels). The two flips are
# requested explicitly, and the result is not unpacked into `augs`, so the `augs` object
# imported from helpers is left untouched.
X_train_aug, y_train_aug = get_augs(
    imgs_raw=X_train, 
    labels_raw=y_train,
    augs=["Fliplr", "Flipud"],
    keep_originals=False,
    verbose=VERBOSE
)

In [8]:
# Display the current value of `augs` (imported from helpers at the top of the notebook,
# unless an earlier cell has rebound the name).
augs

{'Fliplr': {'aug': imgaug.augmenters.flip.Fliplr, 'args': {'p': 1.0}},
 'Flipud': {'aug': imgaug.augmenters.flip.Flipud, 'args': {'p': 1.0}},
 'GaussianBlur': {'aug': imgaug.augmenters.blur.GaussianBlur,
  'args': {'sigma': 6.0}},
 'AverageBlur': {'aug': imgaug.augmenters.blur.AverageBlur,
  'args': {'k': 20.0}},
 'MotionBlur': {'aug': imgaug.augmenters.blur.MotionBlur, 'args': {'k': 15.0}},
 'MultiplyBrightness': {'aug': imgaug.augmenters.color.MultiplyBrightness,
  'args': {'mul': 0.5}},
 'MultiplyHue': {'aug': imgaug.augmenters.color.MultiplyHue,
  'args': {'mul': 0.8}},
 'MultiplySaturation': {'aug': imgaug.augmenters.color.MultiplySaturation,
  'args': {'mul': 0.5}},
 'Grayscale': {'aug': imgaug.augmenters.color.Grayscale, 'args': {'mul': 0.7}},
 'GammaContrast': {'aug': imgaug.augmenters.contrast.GammaContrast,
  'args': {'gamma': 2.0}},
 'SigmoidContrast': {'aug': imgaug.augmenters.contrast.SigmoidContrast,
  'args': {'gain': 9.0}},
 'LinearContrast': {'aug': imgaug.augmenters.c