# Create Training Data Dataframe

In [1]:
def CreateTrainingDataframe(index_path, label_path, broden_dir):
    """
    Build a per-image class table for the training split of the Broden dataset.

    Args:
        index_path: path to the index.csv file in the Broden dataset
        label_path: path to the label.csv file in the Broden dataset
        broden_dir: path to the Broden dataset directory

    Returns:
        pandas.DataFrame with one row per training image, in the order the
        training rows appear in index.csv. The first column, 'image', holds
        the image name; the remaining columns are named after the 'name'
        column of label.csv and hold 1 where a class was marked for the
        image and 0 otherwise.

    Note:
        Class numbers are used directly as column positions, with column 0
        reserved for the image name. This assumes class number k belongs to
        the k-th row of label.csv -- TODO confirm against label.csv.
    """

    import os
    import numpy as np
    import pandas as pd
    from tqdm import tqdm_notebook
    # presumably required so the p1_* notebooks below can be imported as modules
    import nbimporter

    from p1_GetClassesFromImage import GetClassesFromImage
    from p1_AddClassToMatrix import AddClassToMatrix

    image_dir = os.path.join(broden_dir, 'images/')

    index_df = pd.read_csv(index_path, sep=';')
    label_df = pd.read_csv(label_path, sep=';')

    # Select only the images which are meant for training. The index is reset
    # so that a row's position in this frame equals its row in the matrix;
    # the labels inherited from index_df would otherwise point past (or into
    # the wrong row of) a matrix that only has one row per training image.
    training_df = index_df.loc[index_df['split'] == 'train', :].reset_index(drop=True)

    class_list = label_df.loc[:, 'name'].tolist()  # one entry per class

    amount_of_classes = len(class_list)   # amount of columns
    amount_of_pictures = len(training_df)  # amount of rows

    # One row per picture, one column per class, plus a leading column that
    # is later replaced by the image name.
    classes_matrix = np.zeros(shape=(amount_of_pictures, amount_of_classes + 1))

    # Columns of index.csv that reference per-pixel label images.
    label_image_columns = ('color', 'object', 'part', 'material')
    # Columns of index.csv that hold a single class number for the whole image.
    single_class_columns = ('scene', 'texture')

    # Main loop: visit every training image once, by position.
    records = training_df.itertuples(index=False)
    for position, record in enumerate(tqdm_notebook(records, total=amount_of_pictures)):

        # AddClassToMatrix has always been handed a one-element array as the
        # row selector, so keep that calling convention.
        row = np.array([position])

        for column in label_image_columns:
            label_image = getattr(record, column)
            # A missing entry is read as NaN (a float), so only strings are
            # treated as label image references.
            if isinstance(label_image, str):
                found_classes = GetClassesFromImage(label_image, image_dir)
                if np.any(found_classes):  # skip results without any non-zero value
                    classes_matrix = AddClassToMatrix(found_classes, row, classes_matrix)

        for column in single_class_columns:
            class_number = getattr(record, column)
            # These columns contain a single number, which can be linked
            # directly to a column of the matrix.
            if pd.notna(class_number):
                classes_matrix[position, int(float(class_number))] = 1

    # Convert the matrix to a dataframe and add labels to the columns.
    training_data = pd.DataFrame(classes_matrix)
    training_data.columns = ['image'] + class_list

    # Replace the whole placeholder column instead of writing strings into a
    # float column in place, which depends on implicit dtype upcasting.
    training_data['image'] = training_df.loc[:, 'image'].tolist()
    return training_data