# Lab 1 - Pre-Processing Data

In this lab, you will use Python to explore and prepare the image data that you will work with in subsequent labs. Specifically, you need to prepare the images to be used as training data for a machine learning model that you will build in later labs.

## Set Up Data Folder Paths
The raw data is provided in the **../data/voc** folder. You must process this data and save the processed versions of the image files in the **../data/classification/training** folder, retaining the same structure of subfolders for each category of image.

In [0]:
## mount files:
# Mount Google Drive inside the Colab runtime, then change the working
# directory to the course folder so the relative "./data/..." paths used by
# the cells below resolve against it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import os, os.path
# Course folder, relative to the root of "My Drive".
folder = 'Moocs/edx_Microsoft'
os.chdir('/content/drive/My Drive/'+folder)

In [0]:
# Folder holding the raw images (one subfolder per class).
src_folder = "./data/voc"
# Folder for the processed training images.
# NOTE(review): the functions in the cells below define their own local
# copies of these paths rather than reading these module-level names.
train_folder = "./data/classification/training"

## Explore the Images
In the following cell, add code to iterate through subfolders in the **voc** folder, and display the first image in each subfolder. Each subfolder represents a category, or *class*, of image.

> **Hints**:
> - Use the **os.walk** method to iterate through a hierarchy of folders.
> - Use the **os.listdir** method to return a list of files in a subfolder.
> - The first file in a subfolder is the first element of the list (for example, file_list[0]).
> - Use subplots to display the images - you will need to define a grid that shows one image per subfolder.

In [0]:
import os
import matplotlib.pyplot as plt
from matplotlib import image as mp_image

# required to display matplotlib plots in notebooks
%matplotlib inline

def show_img(img_arr): # img_arr = [{name: path}, ...]
    """Display the given images in a grid that is three columns wide.

    Args:
        img_arr: list of single-item dicts, each mapping a title (the class
            name) to the path of the image file to display. Entries whose
            path is a directory are skipped.
    """
    ncols = 3 # show 3 images in a row

    # Keep only the entries that point at a file, so that skipped entries
    # do not leave holes in the grid or count towards its size.
    entries = []
    for img_obj in img_arr:
        name = list(img_obj.keys())[0]
        path = img_obj[name]
        if os.path.isdir(path):
            continue
        entries.append((name, path))

    # Ceiling division: round() would allocate too few rows (e.g. 4 images
    # -> 1 row) and make add_subplot fail on the images that do not fit.
    nrows = -(-len(entries) // ncols)

    # set up a figure of an appropriate size
    fig = plt.figure(figsize=(12, 12))
    for idx, (name, path) in enumerate(entries, start=1):
        image = mp_image.imread(path)
        a = fig.add_subplot(nrows, ncols, idx)
        plt.imshow(image)
        a.set_title(name)
    plt.show()
        
def explore_images():
    """Show the first image of every class subfolder under ./data/voc.

    Walks the source folder, and for each subfolder (except one named
    "training") picks the first entry in sorted name order and passes the
    collected {class_name: image_path} dicts to show_img().
    """
    src_folder = "./data/voc"
    img_arr = []

    for root, dirs, _ in os.walk(src_folder):
        for name in dirs:
            if name == "training":
                continue
            abs_path = os.path.join(root, name)
            file_list = sorted(os.listdir(abs_path))
            if not file_list:
                # An empty subfolder has no first image; skip it instead of
                # failing with IndexError on file_list[0].
                continue
            img_arr.append({name: os.path.join(abs_path, file_list[0])})
    show_img(img_arr)
    
# Run the exploration when this cell executes.
explore_images()

## Standardize the Images
The images vary in size and shape. Most machine learning techniques for computer vision work best when the image data is a consistent format and size, so you must prepare the data accordingly.

Add code to the following cell to standardize the images so that they are all 128x128 JPG files while retaining their original aspect ratio, and save them in the same subfolder structure in a new **training** folder.

> **Hints**:
> - Encapsulate the code to resize an image in a function - use the **def** keyword to define a function.
> - To create a folder, use the **os.makedirs** method.
> - To remove an existing folder that contains files, use the **shutil.rmtree** method.

In [0]:
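# Requirements:
# 1. every image is 128x128 pixels
# 2. every image is saved as a JPG file
# 3. the original aspect ratio is retained
# 4. images are saved using the same subfolder structure in the training folder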

import os
import shutil
from PIL import Image
import matplotlib.pyplot as plt
import math

# required to display matplotlib plots in notebooks
%matplotlib inline

def show_imgs(img_arr): # img_arr = [img_PIL_obj, ...]
    """Plot every image of *img_arr* in a grid that is three columns wide.

    Args:
        img_arr: list of images accepted by plt.imshow (the cell passes PIL
            image objects).
    """
    columns = 3
    # enough rows to hold every image, including a partially filled last row
    rows = math.ceil(len(img_arr) / columns)

    figure = plt.figure()
    for position, picture in enumerate(img_arr, start=1):
        figure.add_subplot(rows, columns, position)
        plt.imshow(picture)
    plt.show()

def resize_retain_ratio(src_image, bg_color="white", target_size=(128, 128)):
    """Fit an image onto a fixed-size canvas while keeping its aspect ratio.

    Args:
        src_image: PIL image to resize. It is resized IN PLACE by
            thumbnail(), so pass a copy if the original must be kept.
        bg_color: fill colour of the canvas around the resized image.
        target_size: (width, height) of the returned image; defaults to
            the 128x128 size required by the lab.

    Returns:
        A new RGB image of size *target_size* with the resized source image
        pasted in its centre.
    """
    # resize the image so it fits within the target size;
    # Image.ANTIALIAS was removed in Pillow 10 - Image.LANCZOS is the same
    # filter under its current name.
    src_image.thumbnail(target_size, Image.LANCZOS)
    # create a new background image of the target size
    new_image = Image.new("RGB", target_size, bg_color)
    # paste the resized image into the center of the background
    offset = ((target_size[0] - src_image.size[0]) // 2,
              (target_size[1] - src_image.size[1]) // 2)
    new_image.paste(src_image, offset)
    # return the resized image
    return new_image
            
def read_images(src_folder="./data/classification/training"):
    """Resize every image found under *src_folder* and save it per subfolder.

    For each subfolder reached by os.walk, every file in it is opened,
    resized with resize_retain_ratio() and collected; the collected images
    are then handed to save_imgs() together with the subfolder name.

    Args:
        src_folder: root folder whose subfolders contain the images to
            process. Defaults to the path the original cell used.

    NOTE(review): the lab text describes reading from ./data/voc and writing
    to ./data/classification/training, whereas this default reads from the
    training folder - confirm which layout is intended.
    """
    for root, dirs, _ in os.walk(src_folder):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            img_arr = []
            for img_name in os.listdir(dir_path):
                item_in_dir = os.path.join(dir_path, img_name)
                if os.path.isdir(item_in_dir):
                    continue
                # Open the file with PIL inside a context manager so the
                # file handle is released after each image instead of
                # staying open for the whole run.
                with Image.open(item_in_dir) as image:
                    # resize a copy; the copy is what gets kept after the
                    # source file is closed
                    resized_img = resize_retain_ratio(image.copy())
                img_arr.append({img_name: resized_img})
            save_imgs(img_arr, dir_name)

def save_imgs(img_arr, dir_name): # img_arr = [{img_name: resized_img}, ...]
    """Write the given images as JPEG files into a fresh per-class folder.

    Args:
        img_arr: list of single-item dicts mapping a file name to an image
            object that provides save(path, format=...).
        dir_name: name of the subfolder, created under
            ./data/classification/resized_training, that receives the files.
            An existing folder of that name is deleted first.
    """
    output_root = "./data/classification/resized_training"
    target_dir = os.path.join(output_root, dir_name)

    # always start from an empty folder: drop any previous output first
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)
    print("ready to save images in", target_dir)

    # each entry holds exactly one {file name: image} pair
    for entry in img_arr:
        file_name, picture = list(entry.items())[0]
        picture.save(os.path.join(target_dir, file_name), format="JPEG")

    print('saved in ', target_dir)
    print('-------')
    
def main0():
    """Entry point of the standardization cell: runs read_images()."""
    read_images()

# Run the resize-and-save step when this cell executes.
main0()

## Compare the Original and Resized Images
Add code to the following cell to view the original and resized version of the first image in each subfolder.

>**Hints**:
> - This code should be similar to the image exploration code above, but you must load the same image file from the subfolder in the original **voc** folder and the subfolder in the **classification/training** folder.
> - This time, your subplots grid must show two images from each folder - you can arrange the images in two rows or two columns (one for original images, the other for resized images).

In [0]:
import os
import matplotlib.pyplot as plt
import math

# required to display matplotlib plots in notebooks
%matplotlib inline

def show_img(img_arr): # img_arr = [(src_image, train_image), ...]
    """Plot original/resized image pairs side by side, one pair per row.

    Args:
        img_arr: list of (src_image, train_image) pairs of images accepted
            by plt.imshow. Odd grid positions are titled "source", even
            positions "train".
    """
    columns = 2 # one column per image of a pair
    rows = math.ceil(len(img_arr)*2 / columns)

    figure = plt.figure(figsize=(12, 12))
    # walk the pairs in order, numbering every single image from 1
    flattened = (picture for pair in img_arr for picture in pair)
    for position, picture in enumerate(flattened, start=1):
        axes = figure.add_subplot(rows, columns, position)
        plt.imshow(picture)
        if position % 2 != 0:
            axes.set_title("source")
        else:
            axes.set_title("train")
    plt.show()
        
def explore_images():
    """Show the first original and first resized image of every class.

    For each immediate subfolder of ./data/voc, loads the first file (in
    sorted name order) from that subfolder and from the subfolder of the
    same name under ./data/classification/training, then passes the
    (source, train) pairs to show_img().
    """
    # Imported here so this cell does not depend on an earlier cell having
    # already brought mp_image into scope.
    from matplotlib import image as mp_image

    src_folder = "./data/voc"
    train_folder = "./data/classification/training"
    img_arr = []

    # Only the immediate subfolders are classes. Taking the first os.walk
    # step avoids collecting nested folder names, which cannot be joined
    # directly onto src_folder. A missing src_folder yields no classes.
    dirs_arr = sorted(next(os.walk(src_folder), (src_folder, [], []))[1])

    for dir_name in dirs_arr:
        src_dir = os.path.join(src_folder, dir_name)
        train_dir = os.path.join(train_folder, dir_name)

        # os.listdir returns entries in arbitrary order; sorting both
        # listings makes the two sides pick the same file when the file
        # names match.
        src_files = sorted(os.listdir(src_dir))
        train_files = sorted(os.listdir(train_dir)) if os.path.isdir(train_dir) else []
        if not src_files or not train_files:
            # nothing to compare for this class; skip instead of raising
            print("skipping", dir_name, "- no image pair to compare")
            continue

        src_image = mp_image.imread(os.path.join(src_dir, src_files[0]))
        train_image = mp_image.imread(os.path.join(train_dir, train_files[0]))

        img_arr.append((src_image, train_image))
    show_img(img_arr)
    
def main():
    """Entry point of the comparison cell: runs explore_images()."""
    explore_images()

# Run the comparison when this cell executes.
main()