In [None]:
import numpy as np
import pandas as pd
import pickle
import os.path
from PIL import Image
from keras.utils import np_utils

# Read the two CSV inputs; low_memory=False makes pandas parse each file in one pass.
account_data_01 = pd.read_csv('results/dataset_analysis.csv', low_memory=False)
clusters = pd.read_csv('results/clusters.csv', low_memory=False)

# Inner join on image_id: only rows whose image_id occurs in both tables are kept.
account_data_02 = pd.merge(account_data_01, clusters, on='image_id', how='inner')

# Report the row counts before and after the join.
print('Number of samples in account_data_01:', len(account_data_01))
print('Number of samples in account_data_02:', len(account_data_02))

In [None]:
# Flatten the merged table into plain Python rows so that columns can be read by position.
image_list = account_data_02.values.tolist()

# Edge length (in pixels) of the square cropped pictures; it is also part of every file name.
picture_size = 256

# Row-index boundaries of the batches: every consecutive pair [start, stop) gets its own set of pickles.
picture_batches = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, len(image_list)]


def _dump_batch(data, prefix, start, stop):
    """Pickle `data` to results/CNN_<prefix>_<picture_size>_<start>_<stop>.p.

    The highest pickle protocol (-1) is used, and the file is opened in a
    context manager so the handle is closed even if pickling fails.
    """
    path = 'results/CNN_' + prefix + '_' + str(picture_size) + '_' + str(start) + '_' + str(stop) + '.p'
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, -1)


for i in range(1, len(picture_batches)):

    batch_start, batch_stop = picture_batches[i - 1], picture_batches[i]

    # Per-image results are collected in lists and converted to arrays once per batch;
    # growing numpy arrays inside the loop would re-copy the whole batch for every image.
    images, likes_groups, cluster_ids, ids = [], [], [], []

    # Clamp to the table length so that a table shorter than a hard-coded boundary
    # produces an empty batch instead of an IndexError.
    for j in range(batch_start, min(batch_stop, len(image_list))):

        row = image_list[j]

        # column 4 is the name of the image folder, column 5 the image id used in the file name
        image_path = 'images/{}/'.format(str(row[4])) + '/crop/' + str(row[5]) + '_' + str(picture_size) + '.jpg'

        # rows without a cropped picture on disk are skipped entirely
        if not os.path.isfile(image_path):
            continue

        # Force three channels so grayscale/CMYK files do not break the reshape below,
        # and close the file handle as soon as the pixels have been read.
        with Image.open(image_path) as img:
            pixels = np.array(img.convert('RGB'), dtype='uint8')

        # Values stay raw uint8 (0-255); no normalization is applied at this stage.
        # The reshape still raises if the picture does not hold picture_size * picture_size pixels.
        images.append(pixels.reshape(picture_size, picture_size, 3))

        # target variable: the first character of column 0, parsed as an integer
        likes_groups.append(int(row[0][0]))

        # cluster the picture belongs to (last column of the merged table)
        cluster_ids.append(int(row[-1]))

        # image_id, kept as a reference back to the table
        ids.append(int(row[5]))

        # print the progress of the import
        if (j > 0) and (j % 1000) == 0:
            print(j, 'images have been processed')

    # Stack the pictures into one (n, picture_size, picture_size, 3) array; an empty batch
    # keeps the same trailing dimensions so it can be concatenated with other batches later.
    if images:
        X = np.stack(images)
    else:
        X = np.empty((0, picture_size, picture_size, 3), dtype='uint8')
    del images

    # Column vectors; int64 so that large image ids are stored without overflow.
    Y = np.array(likes_groups, dtype='int64').reshape(-1, 1)
    cluster_array = np.array(cluster_ids, dtype='int64').reshape(-1, 1)
    image_ids = np.array(ids, dtype='int64').reshape(-1, 1)

    # One-hot encode the labels and keep the columns for labels 1..5.
    # The class count is fixed to at least 6 so every batch yields exactly 5 columns,
    # even when the largest label is missing from the batch or the batch is empty.
    # It is passed positionally because the keyword name differs between Keras versions.
    n_classes = max(6, int(Y.max()) + 1) if Y.size else 6
    Y_one_hot_encoded = np_utils.to_categorical(Y, n_classes)[:, 1:6]

    # save datasets as pickle files (file names use the unclamped batch boundaries)
    _dump_batch(X, 'X', batch_start, batch_stop)
    _dump_batch(Y, 'Y', batch_start, batch_stop)
    _dump_batch(Y_one_hot_encoded, 'Y_one_hot_encoded', batch_start, batch_stop)
    _dump_batch(image_ids, 'image_ids', batch_start, batch_stop)
    _dump_batch(cluster_array, 'clusters', batch_start, batch_stop)

    # Free the batch before the next one is built. The cluster array deliberately uses its
    # own name so the `clusters` DataFrame loaded earlier is not overwritten or deleted.
    del X, Y, Y_one_hot_encoded, image_ids, cluster_array

In [None]:
# Only the row count is used below: it is the upper boundary of the last batch.
image_list = account_data_02.values.tolist()

# These boundaries must match the ones used when the batch pickles were written,
# because they are part of every file name. `picture_size` is likewise taken from
# the cell that wrote the pickles.
picture_batches = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, len(image_list)]


def _load_batch(prefix, start, stop):
    """Load results/CNN_<prefix>_<picture_size>_<start>_<stop>.p and return its content.

    NOTE: pickle.load can execute arbitrary code, so only load files that were
    produced by this notebook. The context manager closes the file even on error.
    """
    path = 'results/CNN_' + prefix + '_' + str(picture_size) + '_' + str(start) + '_' + str(stop) + '.p'
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _stack_rows(parts):
    """Concatenate the non-empty arrays in `parts` along the first axis.

    Empty batches are skipped so their shape cannot clash with the others.
    When every batch is empty the last loaded array is returned.
    """
    non_empty = [part for part in parts if part.size]
    if not non_empty:
        return parts[-1]
    return np.concatenate(non_empty, axis=0)


def _flatten_ints(parts):
    """Flatten and join `parts` into one 1-D integer array.

    The empty int32 seed only fixes the dtype when no batch holds any data;
    otherwise numpy promotes to the widest integer dtype found in `parts`.
    """
    seed = np.array([], dtype='int32')
    return np.concatenate([seed] + [np.ravel(part) for part in parts])


# Collect every batch first and join once at the end, instead of re-copying the
# accumulated arrays after each batch.
X_parts, Y_parts, one_hot_parts, image_id_parts, cluster_parts = [], [], [], [], []

for i in range(1, len(picture_batches)):

    batch_start, batch_stop = picture_batches[i - 1], picture_batches[i]

    X_parts.append(_load_batch('X', batch_start, batch_stop))
    Y_parts.append(_load_batch('Y', batch_start, batch_stop))
    one_hot_parts.append(_load_batch('Y_one_hot_encoded', batch_start, batch_stop))
    image_id_parts.append(_load_batch('image_ids', batch_start, batch_stop))
    cluster_parts.append(_load_batch('clusters', batch_start, batch_stop))

# pictures and one-hot targets keep their trailing dimensions and are joined row-wise
X = _stack_rows(X_parts)
del X_parts

Y_one_hot_encoded = _stack_rows(one_hot_parts)
del one_hot_parts

# labels, image ids and clusters are flattened, joined and turned back into column vectors
Y = _flatten_ints(Y_parts).reshape(-1, 1)
image_ids = _flatten_ints(image_id_parts).reshape(-1, 1)
clusters = _flatten_ints(cluster_parts).reshape(-1, 1)
del Y_parts, image_id_parts, cluster_parts