In [None]:
import pandas as pd
import numpy as np
import os
from os.path import isfile, join
from os import listdir
from shutil import copy
from sklearn.model_selection import train_test_split
import itertools 
import operator

# Class labels. The position of a name in this list is the integer class id
# stored in the 'class' column of images_total_df.
classes_list = ['Animals',
                'Food',
                'Lifts',
                'Other',
                'People',
                'Summer activity',
                'Summer landscape',
                'Winter activity',
                'Winter landscape']

# One [filename, image_id, class_id] record per image file found in the
# folder of each class.
images_total = []

for class_id, class_name in enumerate(classes_list):
    path = 'image_classifier/' + str(class_name)
    # os.listdir returns entries in arbitrary order; sort them so that the
    # seeded train/validation/test split is identical on every machine and run.
    for image in sorted(listdir(path)):
        if not isfile(join(path, image)):
            continue
        # The image id is the filename without its last 8 characters
        # (presumably a fixed-length suffix plus extension -- TODO confirm).
        try:
            image_id = int(image[0:-8])
        except ValueError as err:
            # Same exception type as before, but it now names the offending file
            # (e.g. a stray hidden file inside a class folder).
            raise ValueError("cannot parse an image id from file '{}' in '{}'".format(image, path)) from err
        images_total.append([image, image_id, class_id])

# Name the columns at construction time so that an empty scan still yields a
# frame with the expected columns.
images_total_df = pd.DataFrame(images_total, columns=['filename', 'image_id', 'class'])

In [None]:
# Pull the three columns out as separate Series; they share the frame's index,
# so position k in each of them refers to the same image.
filenames, image_ids, classes = (
    images_total_df[column] for column in ('filename', 'image_id', 'class')
)

In [None]:
# split the data into a training-, validation and test set
#
# Two stratified splits: 20% of all images are held out as the test set, then
# 25% of the remaining 80% become the validation set, i.e. 60% / 20% / 20% of
# the data overall. random_state is fixed so the shuffling is repeatable.

filenames_temp, filenames_test, \
image_ids_temp, image_ids_test, \
classes_temp, classes_test = train_test_split(filenames,
                                              image_ids,
                                              classes,
                                              stratify = classes,
                                              test_size = 0.2,
                                              train_size = 0.8,
                                              random_state = 0)

filenames_train, filenames_val, \
image_ids_train, image_ids_val, \
classes_train, classes_val = train_test_split(filenames_temp,
                                              image_ids_temp,
                                              classes_temp,
                                              stratify = classes_temp,
                                              test_size = 0.25,
                                              train_size = 0.75,
                                              random_state = 0)

# every image must end up in exactly one of the three sets
assert len(filenames_train) + len(filenames_val) + len(filenames_test) == len(filenames)

# print the number of examples in and dimensions of each set

print ('filenames_train shape:', filenames_train.shape[0])
print ('filenames_val shape:', filenames_val.shape[0])
print ('filenames_test shape:', filenames_test.shape[0])

print ('image_ids_train shape:', image_ids_train.shape)
print ('image_ids_val shape:', image_ids_val.shape)
print ('image_ids_test shape:', image_ids_test.shape)

print ('classes_train shape:', classes_train.shape)
print ('classes_val shape:', classes_val.shape)
print ('classes_test shape:', classes_test.shape)

# check the stratification: share of every class in each of the three sets

class_ids = list(range(len(classes_list)))

# Count once per set (not once per class) and reindex over all class ids, so a
# class that is missing from a set is reported as 0% instead of raising a
# KeyError. Each list is ordered by class id, like classes_list.
classes_train_perc, classes_val_perc, classes_test_perc = [
    (labels.value_counts().reindex(class_ids, fill_value=0) / len(labels)).tolist()
    for labels in (classes_train, classes_val, classes_test)
]

# One line per set, with one percentage per class, however many classes there are.
for set_name, shares in (('training', classes_train_perc),
                         ('validation', classes_val_perc),
                         ('test', classes_test_perc)):
    print('The distribution of classes in the {} set:'.format(set_name),
          ' , '.join("{0:.2f}%".format(share * 100) for share in shares))

In [None]:
# due to the low number of images in the category 'Food' these images will be added to the category 'Other'
# The replacement is done by name rather than by position, so it stays correct
# if classes_list is ever reordered. classes_list itself is left unchanged and
# the new list keeps the same length, so class ids index both lists alike.
classes_list_new = ['Other' if class_name == 'Food' else class_name
                    for class_name in classes_list]

In [None]:
# save the training data
#
# Copy every training image from its original class folder into
# image_classifier/training_data/<folder>, where the folder name comes from
# classes_list_new (classes that share a name there share a folder).
# NOTE(review): the destination is never cleared, so files copied by an earlier
# run with a different split stay in place -- confirm this is intended.

for class_id, filename in zip(classes_train, filenames_train):

    source = 'image_classifier/' + classes_list[class_id] + '/' + filename
    destination = 'image_classifier/training_data/' + classes_list_new[class_id]

    # create the destination folder if it does not exist yet
    os.makedirs(destination, exist_ok=True)

    # copy the picture if it exists
    if os.path.isfile(source):
        copy(source, destination)

In [None]:
# save the validation data
#
# Copy every validation image from its original class folder into
# image_classifier/validation_data/<folder>, where the folder name comes from
# classes_list_new (classes that share a name there share a folder).
# NOTE(review): the destination is never cleared, so files copied by an earlier
# run with a different split stay in place -- confirm this is intended.

for class_id, filename in zip(classes_val, filenames_val):

    source = 'image_classifier/' + classes_list[class_id] + '/' + filename
    destination = 'image_classifier/validation_data/' + classes_list_new[class_id]

    # create the destination folder if it does not exist yet
    os.makedirs(destination, exist_ok=True)

    # copy the picture if it exists
    if os.path.isfile(source):
        copy(source, destination)

In [None]:
# save the test data
#
# Copy every test image from its original class folder into
# image_classifier/test_data/<folder>, where the folder name comes from
# classes_list_new (classes that share a name there share a folder).
# NOTE(review): the destination is never cleared, so files copied by an earlier
# run with a different split stay in place -- confirm this is intended.

for class_id, filename in zip(classes_test, filenames_test):

    source = 'image_classifier/' + classes_list[class_id] + '/' + filename
    destination = 'image_classifier/test_data/' + classes_list_new[class_id]

    # create the destination folder if it does not exist yet
    os.makedirs(destination, exist_ok=True)

    # copy the picture if it exists
    if os.path.isfile(source):
        copy(source, destination)

In [None]:
# Pair every (possibly merged) category name with the number of training
# images of the corresponding class id. A name that occurs more than once in
# classes_list_new yields one entry per occurrence.
train_counts = classes_train.value_counts()

weights = [[class_name, train_counts[class_id]]
           for class_id, class_name in enumerate(classes_list_new)]

In [None]:
# Add up the counts of entries that share a category name and list the totals
# as [name, total] pairs ordered alphabetically by name.
category_totals = {}

for entry in weights:
    category_totals[entry[0]] = category_totals.get(entry[0], 0) + int(entry[1])

weights_aggregated = [[category, category_totals[category]]
                      for category in sorted(category_totals)]

In [None]:
# Show the aggregated [category, number of training samples] pairs as the cell output.
weights_aggregated

In [None]:
# Tabulate the aggregated counts: one row per category, with named columns.
weights_aggregated_df = pd.DataFrame(weights_aggregated,
                                     columns=['Category', 'Number_samples'])

In [None]:
# Write the per-category sample counts to results/class_weights.csv.
# to_csv does not create missing folders, so make sure 'results' exists first.
os.makedirs('results', exist_ok=True)
weights_aggregated_df.to_csv('results/class_weights.csv', sep = ',', index = False)