# Train Test Split - Creation of train & test datasets

Mimics scikit-learn's `train_test_split` for images stored in a folder with one sub-folder per class: the images are split into train and test sets, and each set is copied into its own output folder.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-required-Libraries" data-toc-modified-id="Import-required-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import required Libraries</a></span></li><li><span><a href="#Function-definition" data-toc-modified-id="Function-definition-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Function definition</a></span></li><li><span><a href="#Example-usage" data-toc-modified-id="Example-usage-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Example usage</a></span></li><li><span><a href="#Merge-folders,-if-required" data-toc-modified-id="Merge-folders,-if-required-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Merge folders, if required</a></span></li></ul></div>

## Import required Libraries

In [1]:
import os
import shutil
import warnings

import numpy as np
import pandas as pd
from keras_preprocessing.image.directory_iterator import DirectoryIterator
from keras_preprocessing.image.utils import _iter_valid_files
from sklearn.model_selection import train_test_split

## Function definition

Helper function to get the list of valid images in a path

In [4]:
def get_valid_images_in_path(path):
    """
    Return a list of valid images in a path

    Which files count as valid is decided by keras_preprocessing's private
    `_iter_valid_files` helper, called with the extensions listed in
    `DirectoryIterator.white_list_formats` and without following links.

    Parameters
    ----------

        path: str
            Location where the images are stored

    Returns
    -------

        list of str: One path per image, built by joining the parts of each
        entry yielded by `_iter_valid_files`

    """
    # `_iter_valid_files` and `DirectoryIterator` are imported once in the
    # notebook's import cell, so they are not re-imported on every call.
    # NOTE(review): `_iter_valid_files` is a private keras_preprocessing API and
    # may change between releases; each entry is presumably a (folder, file name)
    # tuple - confirm against the installed version.
    valid_entries = _iter_valid_files(path,
                                      DirectoryIterator.white_list_formats,
                                      follow_links=False)
    return [os.path.join(*entry) for entry in valid_entries]

Helper function to create train & test splits of images

In [5]:
def train_test_split_images(path_input,
                            path_output_train,
                            path_output_test,
                            stratify=False,
                            **kwargs):
    """
    Used to create train_test splits of images found in directories, and store them in a particular location.

    Every immediate sub-folder of `path_input` is treated as one class. The image
    paths are split with sklearn's train_test_split, and each image is then copied
    (the originals are left in place) to
    <path_output_train or path_output_test>/<class>/<file name>.

    Parameters
    ----------

        path_input: str
            Location where the input images are stored. The folder structure for identifying images across classes should be same as that expected by keras.preprocessing.image.ImageDataGenerator
        path_output_train: str
            Location where the train image set will be placed after creating train/test splits from the input. Created, together with any missing parent folders, if it does not exist
        path_output_test: str
            Location where the test image set will be placed after creating train/test splits from the input. Created, together with any missing parent folders, if it does not exist
        stratify: bool
            Whether the split of train and test should be based on stratified sampling, to account for the volume proportions of available classes (Defaults to False, i.e., random sampling)

        **kwargs: dict like
            All other parameters passed to sklearn's train_test_split method.
            (`stratify` must not be repeated here; it is derived from the `stratify` flag above)
                - test_size : float, int or None, optional (default=0.25)
                - train_size : float, int, or None, (default=None)
                - random_state : int, RandomState instance or None, optional (default=None)
                - shuffle : boolean, optional (default=True)

    Returns
    -------

        None. A summary with the number of images per split and class is printed.

    Raises
    ------

        ValueError
            If no valid image is found in the class folders of `path_input`.
    """
    # Check if path_output_train and path_output_test exist. If not, they are created.
    # makedirs (rather than mkdir) also creates missing parent folders, so output
    # locations nested below folders that do not exist yet are supported.
    for path_op in (path_output_train, path_output_test):
        if not os.path.exists(path_op):
            os.makedirs(path_op, exist_ok=True)
            print("Path created: %s" % path_op)

    # Identify the list of available categories/classes: every immediate sub-folder
    classes = sorted(
        subdir for subdir in os.listdir(path_input)
        if os.path.isdir(os.path.join(path_input, subdir)))

    # One (class, image path) record per valid image found in the class folders
    list_valid_files = [
        (class_name, image_path)
        for class_name in classes
        for image_path in get_valid_images_in_path(
            os.path.join(path_input, class_name))
    ]
    if not list_valid_files:
        # Fail with an explicit message instead of the generic error the split
        # would raise on an empty dataset.
        raise ValueError(
            "No valid images found in the class folders of: %s" % path_input)

    data_valid_files = pd.DataFrame(list_valid_files,
                                    columns=['class', 'path'])

    # Create Train-Test splits from the dataframe
    flag_stratify = data_valid_files['class'] if stratify else None
    X_train, X_test, y_train, y_test = train_test_split(
        data_valid_files['path'],
        data_valid_files['class'],
        stratify=flag_stratify,
        **kwargs)

    # Create a dataframe to store all the configuration variables required for copying files
    data_result = pd.concat([
        pd.DataFrame({
            'path': X_train,
            'class': y_train,
            'type': ['train'] * len(X_train)
        }),
        pd.DataFrame({
            'path': X_test,
            'class': y_test,
            'type': ['test'] * len(X_test)
        })
    ])

    # Identify the output path for each image: <split root>/<class>/<file name>
    output_roots = {'train': path_output_train, 'test': path_output_test}
    data_result['path_output'] = [
        os.path.join(output_roots[split_type], class_name,
                     os.path.basename(source))
        for source, class_name, split_type in zip(
            data_result['path'], data_result['class'], data_result['type'])
    ]

    # Only the file name is kept, so sources of the same class and split that
    # share a file name map to one destination and would overwrite each other.
    clashing_sources = data_result.loc[
        data_result['path_output'].duplicated(keep=False), 'path']
    if not clashing_sources.empty:
        warnings.warn(
            "%d images map to an already used output path and will overwrite "
            "each other, e.g.: %s" %
            (len(clashing_sources), ', '.join(
                clashing_sources.tolist()[:5])),
            stacklevel=2)

    # Ensure that the required o/p directories exist. If not, they will be created.
    # Each class gets a folder in both splits, even if one split has no image of it.
    for class_name in data_result['class'].unique():
        for path_op in (path_output_train, path_output_test):
            os.makedirs(os.path.join(path_op, class_name), exist_ok=True)

    # Copy the files to the respective locations
    for source, destination in zip(data_result['path'],
                                   data_result['path_output']):
        shutil.copyfile(source, destination)

    print(
        "Samples have been created. \n\nSummary\n------\n\nTotal number of samples:%s. \nVolumes across categories:\n"
        % data_result.shape[0],
        data_result.groupby(['type', 'class']).size())

## Example usage

In [6]:
# Existing folder that holds the input images (read by the split)
PATH_INPUT = r'/Users/shyamravikumar/Documents/Workspace/BAI - Project/Input Files/V4/2. Renamed Images'
# Existing folder under which the split results are written
PATH_OUTPUT = r'/Users/shyamravikumar/Documents/Workspace/BAI - Project/Input Files/V4/3. Train Test Split'

# Destination of each split: a 'train' and a 'test' folder directly inside PATH_OUTPUT
PATH_OUTPUT_TRAIN, PATH_OUTPUT_TEST = (
    os.path.join(PATH_OUTPUT, split_name) for split_name in ('train', 'test')
)

Ensure the paths are valid

In [7]:
# Fail fast, naming the missing folder, before any file is copied.
# check_exists_path is kept as a plain alias of os.path.exists (returns True when the path exists).
check_exists_path = os.path.exists
assert check_exists_path(PATH_INPUT), "PATH_INPUT does not exist: %s" % PATH_INPUT
assert check_exists_path(PATH_OUTPUT), "PATH_OUTPUT does not exist: %s" % PATH_OUTPUT

Run the train test split

In [8]:
# Stratified split that puts 60% of the images in the train set;
# random_state is fixed so that re-running the cell gives the same split.
train_test_split_images(PATH_INPUT,
                        PATH_OUTPUT_TRAIN,
                        PATH_OUTPUT_TEST,
                        stratify=True,
                        train_size=0.6,
                        random_state=12334)

Path created: /Users/shyamravikumar/Documents/Workspace/BAI - Project/Input Files/V4/3. Train Test Split/train
Path created: /Users/shyamravikumar/Documents/Workspace/BAI - Project/Input Files/V4/3. Train Test Split/test




Samples have been created. 

Summary
------

Total number of samples:149. 
Volumes across categories:
 type   class                
test   LAV-P1                    4
       LAV-P2                    2
       LAV-P3                    3
       LDL-P1                    1
       LDL-P2                    4
       LDL-P3                    1
       LDL-P4                    2
       Not Defective            28
       Not Defective Bubbles    15
train  LAV-P1                    6
       LAV-P2                    2
       LAV-P3                    4
       LDL-P1                    2
       LDL-P2                    7
       LDL-P3                    3
       LDL-P4                    2
       Not Defective            41
       Not Defective Bubbles    22
dtype: int64
