In [None]:
from os.path import isfile, join
from PIL import Image
import glob
import numpy as np 
from sklearn.decomposition import PCA
import skimage as sk
from skimage import transform
from skimage import util
from skimage import io
from scipy import ndarray
import random
import pandas as pd

In [None]:
#All paths should be constructed with the join function

def resize(path, size):
    """Resize every ``.jpg`` image in a folder, overwriting the originals.

    Each image is scaled to ``(size, size)`` and saved back under the same
    file name, so the original pixel data is replaced.

    Params:
        path: folder that contains the image set.
        size: target edge length; every image becomes ``size`` x ``size``.
    """
    for filename in glob.glob(join(path, "*.jpg")):
        # Close the source file before saving: the result is written over
        # the very file that was just read, and leaving one handle open per
        # image leaks file descriptors on large folders.
        with Image.open(filename) as image:
            new_image = image.resize((size, size))
        new_image.save(filename)

def load_data(folder_path):
    """Load every ``.jpg`` file in a folder as a list of images.

    Files are read with ``skimage.io.imread``. Only the ``.jpg`` extension is
    used to select files; there is no further check that a file really is an
    image.

    Params:
        folder_path: folder that contains the data set.

    Returns:
        A list with one loaded image per file, ordered by file name. The
        list is empty when the folder holds no ``.jpg`` files.
    """
    # glob yields matches in arbitrary order; sorting makes the returned
    # list reproducible from run to run.
    filenames = sorted(glob.glob(join(folder_path, "*.jpg")))
    return [sk.io.imread(filename) for filename in filenames]

def random_rotation(image_array: ndarray):
    """Rotate an image by a random angle.

    The angle is drawn uniformly from -45 to 45 degrees; change the two
    bounds below to allow a different maximum rotation.

    Returns:
        The result of ``skimage.transform.rotate`` for the drawn angle.
    """
    angle = random.uniform(-45, 45)
    rotated = sk.transform.rotate(image_array, angle)
    return rotated

def random_noise(image_array: ndarray):
    """Add random noise to an image.

    Delegates to ``skimage.util.random_noise`` with its default settings.

    Returns:
        The noisy image produced by ``skimage.util.random_noise``.
    """
    noisy = sk.util.random_noise(image_array)
    return noisy

def horizontal_flip(image_array: ndarray):
    """Mirror an image left to right.

    Reverses the second axis of the array (the columns) and leaves every
    other axis untouched.

    Returns:
        The array indexed with its second axis in reverse order.
    """
    mirrored = image_array[:, ::-1]
    return mirrored

def data_augmentation(images, destination_path, num_files_desired):
    """Generate randomly augmented images and save them as JPEG files.

    For every output file one image is picked at random from ``images`` and
    between one and three randomly chosen transformations (rotation, noise,
    horizontal flip) are applied one after another. The same transformation
    may be picked more than once. Results are written to
    ``destination_path/<n>.jpg`` with ``n`` running from 1 to
    ``num_files_desired``.

    The label of a generated file can still be derived from the name of the
    destination folder; the original file name is not preserved.

    Params:
        images: image set (a sequence of image arrays).
        destination_path: folder the augmented images are saved to; it is
            created when it does not exist yet.
        num_files_desired: number of files to generate, e.g.
            ``3 * len(images)``. Zero or a negative value generates nothing.

    Raises:
        IndexError: if ``images`` is empty while ``num_files_desired`` is
            positive (raised by ``random.choice``).
    """
    # Local import: the file only imports individual names from os.path.
    import os

    # dictionary of the transformations
    available_transformations = {
        'rotate': random_rotation,
        'noise': random_noise,
        'horizontal_flip': horizontal_flip
    }
    transformations = list(available_transformations.values())

    if destination_path:
        os.makedirs(destination_path, exist_ok=True)

    for file_number in range(1, num_files_desired + 1):
        transformed_image = random.choice(images)

        num_transformations_to_apply = random.randint(1, len(transformations))
        for _ in range(num_transformations_to_apply):
            transform = random.choice(transformations)
            # Feed the previous result back in so that the transformations
            # accumulate instead of each one restarting from the original.
            transformed_image = transform(transformed_image)

        new_file_path = join(destination_path, '{}.jpg'.format(file_number))
        # Rotation and noise produce float images; convert to 8-bit so the
        # JPEG writer always receives the same kind of data.
        io.imsave(new_file_path, sk.util.img_as_ubyte(transformed_image))

def pca_processing(X, num_components=None):
    """Run principal component analysis on a data set.

    A new ``PCA`` is fitted on ``X`` and ``X`` is projected onto the fitted
    components. The fitted ``PCA`` object itself is not returned.

    Params:
        X: data set.
        num_components: number of components to keep; ``None`` is passed
            through to ``PCA`` unchanged, which then decides how many to keep.

    Returns:
        The data set after the transformation.
    """
    # To inspect the output while testing, wrap the returned array in
    # pd.DataFrame(...) and call .to_csv('pca.csv', index=False) on it.
    reducer = PCA(n_components=num_components)
    return reducer.fit_transform(X)

    

In [None]:
train_path = "train/train"
augmentation_path = "augmented"
test_path = "test"

# Example: resize, load and augment the images of every class folder under
# train_path, then reduce the augmented set with PCA.
data_original, data_augmented = [], []

for i in range(41):
    # Class folders are named with two digits: "00", "01", ..., "40".
    class_folder = str(i).zfill(2)
    folder_path = join(train_path, class_folder)
    augmented_folder_path = join(augmentation_path, class_folder)

    # Resize everything to 300 x 300; this overwrites the files in place.
    resize(folder_path, 300)

    temp = load_data(folder_path)
    data_original.extend(temp)

    # Request three augmented files for every original image of this class.
    data_augmentation(temp, augmented_folder_path, 3 * len(temp))
    data_augmented.extend(load_data(augmented_folder_path))

# NOTE: to use PCA on test data, the fitted PCA weights have to be saved and
# applied to each test instance before it is fed to the model.
pca_input = np.array(data_augmented)
input_shape = pca_input.shape
# Flatten every image into a single row vector.
pca_input = pca_input.reshape(
    input_shape[0], input_shape[1] * input_shape[2] * input_shape[3]
)
# Reduce from 300*300*3 dimensions to 7500 (= 50*50*3) dimensions.
X_transformed = pca_processing(pca_input, 7500)

#Can further apply techniques such as normalization or feature scaling before giving the data to the model as input
