In [None]:
import pandas as pd
import numpy as np
import pickle

from matplotlib import pyplot as plt
import matplotlib.cm as cm

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

import os.path
from PIL import Image
import scipy.misc
import cv2

# cv2.calcHist takes the following parameters: (images, channels, mask, histSize, ranges[, hist[, accumulate]])
#    - images:              the source image(s) of type uint8 or float32, given in square brackets
#    - channels:            also given in square brackets, the index of each channel for which the histogram
#                           is calculated
#                                - if the input is a grayscale image, its value is [0]
#                                - if the input is a color image, pass [0], [1] or [2] to calculate the histogram
#                                  of the first, second or third channel (blue, green, red for cv2.imread images)
#    - mask:                mask image; to compute the histogram of the full image pass None, to compute the
#                           histogram of a particular region of the image, create a mask image for that region
#                           and pass it as mask
#    - histSize:            the BIN count per channel, given in square brackets (full scale -> [256])
#    - ranges:              the value RANGE per channel, normally [0, 256]

# the feature matrix (one colour histogram per picture) is built from the records listed in this file

dataset_path = 'results/dataset_analysis.csv'
account_data_01 = pd.read_csv(dataset_path, low_memory = False)

record_count = len(account_data_01.index)
print('Number of records in account_data_01:', record_count)

In [None]:
# let's create a list with only image_id and accountname for each picture
image_list = account_data_01[['image_id', 'accountname']].values.tolist()

# collect the per-picture results in plain lists and stack them once after the loop; growing the arrays
# with np.vstack / np.append inside the loop copies everything gathered so far on every iteration
histograms = []
image_ids = []

# import a picture and calculate the histogram for the different channels
for j, (raw_image_id, accountname) in enumerate(image_list):

    # location of the 256px crop of this picture inside the folder of its account
    crop_path = os.path.join('images', str(accountname), 'crop', str(raw_image_id) + '_256.jpg')

    # import the picture if it exists
    if os.path.isfile(crop_path):

        # convert picture to an array; forcing RGB guarantees the 3 channels that the 3D histogram
        # below indexes, also for grayscale / CMYK / RGBA files
        with Image.open(crop_path) as img:
            arr = np.array(img.convert('RGB'))

        # calculate the joint 8 x 8 x 8 histogram over the three channels, flattened to 512 values
        hist = cv2.calcHist([arr], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]).flatten()

        # keep the histogram and the image_id it belongs to at the same position in both lists
        histograms.append(hist)
        image_ids.append(int(raw_image_id))

    # print the progress of the import (also when the picture at this position was missing)
    if (j > 0) and (j%5000) == 0:
        print(j, 'images have been processed')

# one row per imported picture; fall back to an empty (0, 512) matrix when no crop was found at all
X = np.vstack(histograms) if histograms else np.empty((0, 8 * 8 * 8), dtype = np.float32)

# make Y 2-dimensional; the ids are stored as integers so they are not written out as floats later on
Y = np.array(image_ids, dtype = np.int64).reshape(-1, 1)

In [None]:
# dimensions of the feature matrix and of the matching image_id column
np.shape(X), np.shape(Y)

In [None]:
# save datasets as pickle file
# (protocol -1 selects the highest pickle protocol available; the with-blocks make sure the files are
#  closed again even when dumping fails half-way)
with open('results/cluster_X_3D_8bins.p', 'wb') as pickleX:
    pickle.dump(X, pickleX, -1)

with open('results/cluster_Y_3D_8bins.p', 'wb') as pickleY:
    pickle.dump(Y, pickleY, -1)

In [None]:
# import pickle files
# NOTE: pickle.load can execute arbitrary code stored in the file, only load files written by this notebook
# (the with-blocks make sure the files are closed again even when loading fails)
with open('results/cluster_X_3D_8bins.p', 'rb') as pickleX:
    X = pickle.load(pickleX)

with open('results/cluster_Y_3D_8bins.p', 'rb') as pickleY:
    Y = pickle.load(pickleY)

In [None]:
# dimensions of the reloaded feature matrix and image_id column
np.shape(X), np.shape(Y)

In [None]:
# KMeans will crash on the full dataset, so draw a random half of the pictures (fixed seed) to work with
splits = train_test_split(
    X,
    Y,
    train_size = 0.5,
    test_size = 0.5,
    random_state = 0,
)
X_train, X_test, Y_train, Y_test = splits

X_train.shape, Y_train.shape

In [None]:
# let's select two pictures, a typical non winter picture and a typical winter picture, manually:
# for both image ids show how often they occur in Y and the first row of Y that holds them
mask_3900412 = (Y == 3900412)
mask_3400094 = (Y == 3400094)

mask_3900412.sum(), mask_3400094.sum(), np.where(mask_3900412)[0][0], np.where(mask_3400094)[0][0]

In [None]:
# example of a typical non-winter picture and of a typical winter picture
image_list = account_data_01[['image_id', 'accountname']].values.tolist()

# these numbers are used as row positions in X / Y
# (presumably the positions returned by the np.where lookup - TODO confirm)
non_winter_id = 8186
winter_id = 39640

# X / Y only hold pictures whose crop file existed during the feature extraction, so their row numbers
# do not have to line up with the rows of image_list; resolve the file through the image_id in Y instead
# (assumes image_id values are unique - TODO confirm)
records_by_image_id = {}
for raw_image_id, accountname in image_list:
    try:
        records_by_image_id.setdefault(int(raw_image_id), (raw_image_id, accountname))
    except (TypeError, ValueError, OverflowError):
        # a record without an integer image_id cannot be looked up through Y
        continue

# non-winter picture
raw_image_id, accountname = records_by_image_id[int(np.ravel(Y[non_winter_id])[0])]
folder_path = 'images/{}/'.format(str(accountname))
img_non_winter = Image.open(folder_path + '/crop/' + str(raw_image_id) + str('_256.jpg'))
arr_non_winter = np.array(img_non_winter)

print('Picture non-winter:', arr_non_winter.shape, Y[non_winter_id])

plt.imshow(arr_non_winter)
plt.show()

# winter picture
raw_image_id, accountname = records_by_image_id[int(np.ravel(Y[winter_id])[0])]
folder_path = 'images/{}/'.format(str(accountname))
img_winter = Image.open(folder_path + '/crop/' + str(raw_image_id) + str('_256.jpg'))
arr_winter = np.array(img_winter)

print('Picture winter:', arr_winter.shape, Y[winter_id])

plt.imshow(arr_winter)
plt.show()

In [None]:
# let's check the histogram distributions of two different pictures

# one x position per histogram bin; taken from X so the plot keeps working when the bin count changes
observations = np.arange(1, X.shape[1] + 1, 1)

plt.figure()
fig = plt.gcf()
fig.set_size_inches(12, 8)

non_winter, = plt.plot(observations, X[non_winter_id, :], color = '#FF9933', linewidth = 0.75, linestyle = '-')
winter, = plt.plot(observations, X[winter_id, :], color = '#0099FF', linewidth = 0.75, linestyle = '-')

plt.title(('\nHistogram distributions of two pictures\n'), fontsize = 14)
# the pictures are read with PIL, which delivers the channels in RGB order (not OpenCV's BGR)
plt.xlabel('color bins across 3 channels (RGB)', fontsize = 11, labelpad = 10)
plt.ylabel('Number of pixels', fontsize = 11)

legend = plt.legend([non_winter, winter],
               ['non-winter picture',
                'winter picture'],
                loc = 2,
                facecolor = 'white',
                edgecolor = 'black',
                borderaxespad = 1)

plt.show()

In [None]:
# two candidate starting centroids: the (unscaled) histogram rows of the non-winter and the winter example
# NOTE(review): the KMeans call in this notebook uses init = 'k-means++', init_clusters is not passed to it
init_clusters = X[[non_winter_id, winter_id]]

In [None]:
%%javascript
// Replace the _should_scroll method on the notebook frontend's OutputArea prototype with a function
// that ignores its argument and always returns false.
// NOTE(review): presumably this stops long cell outputs from being collapsed into a scroll box;
// it relies on the global IPython object of the notebook frontend - confirm it exists in the target UI.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# standardize the histogram features; the scaler is fitted on the training half and has to be applied
# to every other dataset that is handed to the clusterer later on
scaler = StandardScaler().fit(X_train)
X_sample_standardized = scaler.transform(X_train)

range_n_clusters = [2]

for n_clusters in range_n_clusters:

    # create a figure with a single axes that holds the silhouette plot
    plt.figure()
    fig = plt.gcf()
    fig.set_size_inches(10, 7)

    ax = plt.gca()

    # the silhouette coefficient can range from -1, 1 but let's focus on the following range: [-0.2, 1]
    ax.set_xlim([-0.2, 1])

    # the (n_clusters + 1) * 10 is for inserting blank space between silhouette plots of individual clusters,
    ax.set_ylim([0, len(X_sample_standardized) + (n_clusters + 1) * 10])

    # the global seed is kept for code that relies on it; the estimator gets the same seed explicitly
    # so that fitting it again gives the same clustering
    np.random.seed(42)

    clusterer = KMeans(n_clusters = n_clusters,
                       algorithm = 'elkan',
                       max_iter = 100,
                       init = 'k-means++',
                       n_init = 10,
                       random_state = 42)

    # cluster label of every sample in the standardized training half
    kmeans = clusterer.fit_predict(X_sample_standardized)

    # compute the silhouette scores for each sample (this is the expensive, pairwise-distance step)
    sample_silhouette_values = silhouette_samples(X_sample_standardized, kmeans)

    # the average over all samples is exactly what silhouette_score returns, so reuse the per-sample
    # values instead of computing all pairwise distances a second time
    #    -> this gives a perspective into the density and separation of the formed clusters
    silhouette_avg = sample_silhouette_values.mean()

    y_lower = 10

    for i in range(n_clusters):

        # aggregate the silhouette scores for samples belonging to cluster i and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[kmeans == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral does not exist in newer matplotlib releases, nipy_spectral is the same colormap
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0,
                         ith_cluster_silhouette_values,
                         facecolor = color,
                         edgecolor = color,
                         alpha = 0.5)

        # label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title(('\nSilhouette analysis for KMeans clustering with %i clusters\n' %n_clusters), fontsize = 14)
    ax.set_xlabel('Silhouette coefficient values', fontsize = 11, labelpad = 10)
    ax.set_ylabel('Cluster label', fontsize = 11)

    # the vertical line for average silhouette score of all the values
    ax.axvline(x = silhouette_avg, color = 'red', linestyle = '--')

    # clear the yaxis labels / ticks
    ax.set_yticks([])
    ax.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.grid(color = 'grey', linestyle = '--', linewidth = 0.25)

    # write the figure to disk before showing it, some backends discard the figure once it is shown
    filename = 'results/silhouette_plot.png'
    fig.savefig(filename)

    plt.show()

    print('For', n_clusters, 'clusters the average silhouette_score is:', silhouette_avg)

In [None]:
# reuse the labels of the model that was fitted for the silhouette plot instead of calling fit() again:
# a second fit costs a complete KMeans run and, without a fixed random_state, does not have to
# reproduce the clustering that was just evaluated
clusters = clusterer.labels_.reshape(-1, 1)
unique, counts = np.unique(clusters, return_counts = True)

print('The number of samples in each cluster:\n', np.asarray((unique, counts)).T)

In [None]:
# cluster label of every training sample as a column vector
clusters = np.reshape(clusterer.labels_, (-1, 1))
clusters

In [None]:
# image ids of the training samples that were assigned to cluster 1
rows_cluster_1, cols_cluster_1 = np.nonzero(clusters == 1)
Y_train[rows_cluster_1, cols_cluster_1]

In [None]:
# for some of these pictures: the first row of Y that holds their image id
tuple(np.where(Y == image_id)[0][0] for image_id in (6800766, 5200776, 401550))

In [None]:
# let's see these pictures
# NOTE: despite the name these are row positions in X / Y, not image ids
cluster_1_image_ids = [20451, 48420, 32103]

# X / Y only hold pictures whose crop file existed during the feature extraction, so their row numbers
# do not have to line up with the rows of image_list; resolve the file through the image_id in Y instead
# (assumes image_id values are unique - TODO confirm)
records_by_image_id = {}
for raw_image_id, accountname in image_list:
    try:
        records_by_image_id.setdefault(int(raw_image_id), (raw_image_id, accountname))
    except (TypeError, ValueError, OverflowError):
        # a record without an integer image_id cannot be looked up through Y
        continue

for cluster_1_image_id in cluster_1_image_ids:

    raw_image_id, accountname = records_by_image_id[int(np.ravel(Y[cluster_1_image_id])[0])]

    folder_path = 'images/{}/'.format(str(accountname))
    img = Image.open(folder_path + '/crop/' + str(raw_image_id) + str('_256.jpg'))
    arr = np.array(img)

    # these are the cluster 1 examples, the message used to announce them as cluster 0
    print('Picture example cluster 1:', arr.shape, Y[cluster_1_image_id])

    plt.imshow(arr)
    plt.show()

In [None]:
# image ids of the training samples that were assigned to cluster 0
rows_cluster_0, cols_cluster_0 = np.nonzero(clusters == 0)
Y_train[rows_cluster_0, cols_cluster_0]

In [None]:
# for some of these pictures: the first row of Y that holds their image id
tuple(np.where(Y == image_id)[0][0] for image_id in (2600232, 1401150, 3100120))

In [None]:
# let's see these pictures
# NOTE: despite the name these are row positions in X / Y, not image ids
cluster_0_image_ids = [41748, 49528, 2732]

# X / Y only hold pictures whose crop file existed during the feature extraction, so their row numbers
# do not have to line up with the rows of image_list; resolve the file through the image_id in Y instead
# (assumes image_id values are unique - TODO confirm)
records_by_image_id = {}
for raw_image_id, accountname in image_list:
    try:
        records_by_image_id.setdefault(int(raw_image_id), (raw_image_id, accountname))
    except (TypeError, ValueError, OverflowError):
        # a record without an integer image_id cannot be looked up through Y
        continue

for cluster_0_image_id in cluster_0_image_ids:

    raw_image_id, accountname = records_by_image_id[int(np.ravel(Y[cluster_0_image_id])[0])]

    folder_path = 'images/{}/'.format(str(accountname))
    img = Image.open(folder_path + '/crop/' + str(raw_image_id) + str('_256.jpg'))
    arr = np.array(img)

    # these are the cluster 0 examples, the message used to announce them as cluster 1
    print('Picture example cluster 0:', arr.shape, Y[cluster_0_image_id])

    plt.imshow(arr)
    plt.show()

In [None]:
# the cluster analysis is done on a random sample of 50% of the full dataset, let's assign every sample
# to a cluster and save the results

# the clusterer was fitted on standardized features, so the full dataset has to go through the same
# scaler first; predicting on the raw histograms would compare them with centroids on another scale
X_standardized = scaler.transform(X)

clusters_full_dataset = clusterer.predict(X_standardized).reshape(-1, 1)
unique, counts = np.unique(clusters_full_dataset, return_counts = True)

print('The number of samples in each cluster:\n', np.asarray((unique, counts)).T)

In [None]:
# one row per picture: the assigned cluster and the image_id it belongs to
# (both columns are cast to integers explicitly; stacking the labels next to a float-typed Y would
#  write values such as 1.0 and 3900412.0 to the csv file)
cluster_df = pd.DataFrame({
    'cluster': np.ravel(clusters_full_dataset).astype(np.int64),
    'image_id': np.ravel(Y).astype(np.int64),
})
cluster_df = cluster_df[['cluster', 'image_id']]
cluster_df.to_csv('results/clusters.csv', sep = ',', index = False)