In [None]:
import numpy as np
import pandas as pd
import pickle
import itertools
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# seed NumPy's global random number generator
np.random.seed(1337)

# image-level dataset joined with the cluster file on image_id
account_data_01 = pd.read_csv('results/dataset_analysis.csv', low_memory = False)
clusters = pd.read_csv('results/clusters.csv', low_memory = False)
account_data_02 = pd.merge(account_data_01, clusters, on = 'image_id', how = 'inner')

# keep only image_id and y_hat, then one-hot encode y_hat into y_hat_* columns
total_predictions = pd.read_csv('total_predictions.csv', low_memory = False)
total_predictions = pd.get_dummies(total_predictions[['image_id', 'y_hat']], columns = ['y_hat'])

account_data_03 = pd.merge(account_data_02, total_predictions, on = 'image_id', how = 'inner')

# report the row count after every inner join so lost or duplicated rows are visible
print('Number of samples in account_data_01:', len(account_data_01))
print('Number of samples in account_data_02:', len(account_data_02))
print('Number of samples in account_data_03:', len(account_data_03))

In [None]:
# read the image_ids that belong to the training, validation and test set (one csv file per split)
image_ids_train, image_ids_val, image_ids_test = [
    pd.read_csv(csv_path, low_memory = False)
    for csv_path in ('results/image_ids_train.csv',
                     'results/image_ids_val.csv',
                     'results/image_ids_test.csv')
]

In [None]:
# build the training, validation and test set by inner-joining the full
# dataset with the image_ids of the respective split
total_train, total_val, total_test = [
    pd.merge(account_data_03, split_ids, on = 'image_id', how = 'inner')
    for split_ids in (image_ids_train, image_ids_val, image_ids_test)
]

In [None]:
# image_ids of the rows that actually ended up in the test set (kept as a one-column frame)
image_ids_test2 = total_test.loc[:, ['image_id']]

In [None]:
# let's create a feature matrix and a target variable

# columns removed from every split before it is used as a feature matrix
non_feature_columns = ['likes_groups', 'log_likes_image_corrected', 'likes_image_corrected', 'cluster', 'image_id', 'resort', 'accountname']

X_train = total_train.drop(columns = non_feature_columns)
X_val = total_val.drop(columns = non_feature_columns)
X_test = total_test.drop(columns = non_feature_columns)


def _likes_group_to_int(label):
    """Return the first character of a likes_groups value as an integer."""
    return int(str(label)[0])


Y_train = total_train['likes_groups'].apply(_likes_group_to_int)
Y_val = total_val['likes_groups'].apply(_likes_group_to_int)
Y_test = total_test['likes_groups'].apply(_likes_group_to_int)

In [None]:
# shapes of the three feature matrices followed by the three target vectors
tuple(split.shape for split in (X_train, X_val, X_test, Y_train, Y_val, Y_test))

In [None]:
# feature names stored by an earlier step, read as a table and flattened row by row
features_relevant_df = pd.read_csv('results/features_relevant_logistic_regression.csv', low_memory = False)
nested_features_relevant_list = features_relevant_df.values.tolist()
features_relevant_list = list(itertools.chain.from_iterable(nested_features_relevant_list))

# one-hot columns created from y_hat when the predictions were loaded
extra_features = ['y_hat_0', 'y_hat_1', 'y_hat_2', 'y_hat_3', 'y_hat_4', 'y_hat_5', 'y_hat_6', 'y_hat_7']

features_list = features_relevant_list + extra_features
len(features_list)

In [None]:
# let's scale the data (mean of zero and standard deviation of one) as input to a logistic regression for
# maximum interpretability of the results and perform a sanity check on the scaled matrices
# (the scaler is fitted on the training set only and then applied to all three sets)


def _print_scaled_summary(header, scaled_matrix):
    """Print dtype, shape and the summed column means / standard deviations of a scaled matrix."""
    print(header)
    print('   - datatype:', scaled_matrix.dtype)
    print('   - shape of the dataset:', scaled_matrix.shape)
    print('   - sum of the means of the columns:', round(scaled_matrix.mean(axis = 0).sum(), 2))
    print('   - sum of the standard deviations of the columns:', round(scaled_matrix.std(axis = 0).sum(), 2))


X_train_selection = X_train[features_list]
X_val_selection = X_val[features_list]
X_test_selection = X_test[features_list]

scaler = StandardScaler()
scaler.fit(X_train_selection)

X_train_selection_scaled = scaler.transform(X_train_selection)
X_val_selection_scaled = scaler.transform(X_val_selection)
X_test_selection_scaled = scaler.transform(X_test_selection)

_print_scaled_summary('Information about the training set:\n', X_train_selection_scaled)
print('\n')
_print_scaled_summary('Information about the validation set:\n', X_val_selection_scaled)
print('\n')
_print_scaled_summary('Information about the test set:\n', X_test_selection_scaled)

In [None]:
# fit a multinomial logistic regression on the scaled training features and
# report the accuracy on both the training and the validation set

clf = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg', random_state = 0)
clf.fit(X_train_selection_scaled, Y_train)

# predictions for both sets
Y_train_pred = clf.predict(X_train_selection_scaled)
Y_val_pred = clf.predict(X_val_selection_scaled)

# fraction of correctly classified samples per set
accuracy_train = accuracy_score(y_true = Y_train, y_pred = Y_train_pred)
accuracy_validation = accuracy_score(y_true = Y_val, y_pred = Y_val_pred)

print('Accuracy on the training set:', "{0:.2f}".format(accuracy_train))
print('Accuracy on the validation set:', "{0:.2f}".format(accuracy_validation))

In [None]:
# score the fitted logistic regression on the test set
Y_test_pred = clf.predict(X_test_selection_scaled)
accuracy_test = accuracy_score(y_true = Y_test, y_pred = Y_test_pred)

print('Accuracy on the test set:', "{0:.2f}".format(accuracy_test))

In [None]:
# one row per selected feature: its name followed by the coefficient for each of the five classes
coefficient_table = np.column_stack((X_train_selection.columns.values, clf.coef_.T))
coefficients = pd.DataFrame(coefficient_table,
                            columns = ['feature', '1 - HH', '2 - H', '3 - M', '4 - L', '5 - LL'])

In [None]:
# display the coefficient table (one row per feature, one column per class)
coefficients

## Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation
from keras.initializers import he_uniform, glorot_uniform
from keras.optimizers import Adam
from keras.utils import np_utils, plot_model
from keras.utils.vis_utils import model_to_dot

# fully connected classifier: five hidden Dense(64) + ReLU blocks followed by a
# five-unit softmax output; only the first layer declares the input shape
hidden_layer_names = [('dense1', 'activation1'),
                      ('dense2', 'activation2'),
                      ('dense3', 'activation3'),
                      ('dense4', 'activation4'),
                      ('dense5', 'activation5')]

features = Sequential()

for layer_position, (dense_name, activation_name) in enumerate(hidden_layer_names):
    dense_options = dict(kernel_initializer = he_uniform(), name = dense_name)
    if layer_position == 0:
        dense_options['input_shape'] = (X_train_selection_scaled.shape[1],)
    features.add(Dense(64, **dense_options))
    features.add(Activation('relu', name = activation_name))

# output layer: one unit per class
features.add(Dense(5, kernel_initializer = glorot_uniform(), name = 'dense6'))
features.add(Activation('softmax', name = 'softmax'))

In [None]:
# print the layer-by-layer summary of the network defined above
features.summary()

In [None]:
# One-hot encode the targets for the five-unit softmax output.
# The slice [:, 1:6] keeps the columns for labels 1..5 and drops column 0.
# Without an explicit num_classes, to_categorical sizes its output from the largest
# label present, so a split that happens to lack class 5 would yield fewer than five
# columns and no longer match the network output. Fixing the width at 6 (columns 0..5)
# makes every split produce exactly five columns after the slice.
Y_train_one_hot = np_utils.to_categorical(Y_train, num_classes = 6)[:, 1:6]
Y_val_one_hot = np_utils.to_categorical(Y_val, num_classes = 6)[:, 1:6]
Y_test_one_hot = np_utils.to_categorical(Y_test, num_classes = 6)[:, 1:6]

In [None]:
# training stage 1: five epochs with Adam at a learning rate of 1e-3
optimizer_stage_1 = Adam(lr = 1e-3)
features.compile(optimizer = optimizer_stage_1, loss = "categorical_crossentropy", metrics = ["accuracy"])

history_epochs_0_5 = features.fit(X_train_selection_scaled,
                                  Y_train_one_hot,
                                  batch_size = 16,
                                  epochs = 5,
                                  validation_data = (X_val_selection_scaled, Y_val_one_hot))

In [None]:
# training stage 2: recompile with Adam at a learning rate of 1e-4 and train five more epochs
optimizer_stage_2 = Adam(lr = 1e-4)
features.compile(optimizer = optimizer_stage_2, loss = "categorical_crossentropy", metrics = ["accuracy"])

history_epochs_5_10 = features.fit(X_train_selection_scaled,
                                   Y_train_one_hot,
                                   batch_size = 16,
                                   epochs = 5,
                                   validation_data = (X_val_selection_scaled, Y_val_one_hot))

In [None]:
# training stage 3: recompile with Adam at a learning rate of 1e-5 and train five more epochs
optimizer_stage_3 = Adam(lr = 1e-5)
features.compile(optimizer = optimizer_stage_3, loss = "categorical_crossentropy", metrics = ["accuracy"])

history_epochs_10_15 = features.fit(X_train_selection_scaled,
                                    Y_train_one_hot,
                                    batch_size = 16,
                                    epochs = 5,
                                    validation_data = (X_val_selection_scaled, Y_val_one_hot))

In [None]:
# evaluate the network on the test set; score holds the loss first and the accuracy second
score = features.evaluate(x = X_test_selection_scaled, y = Y_test_one_hot)
test_loss, test_accuracy = score[0], score[1]

print('\nLoss:', "{0:.3f}".format(test_loss))
print('Test accuracy:', "{0:.2f}%".format(test_accuracy * 100))

In [None]:
# Predict the class of every test sample with the network.
# The whole test matrix is passed to predict() in a single call instead of one call per
# row: this removes the hard-coded feature count (reshape(1, 51)), which would break as
# soon as the number of selected features changes, and avoids thousands of tiny predict calls.
test_probabilities = features.predict(X_test_selection_scaled)

# argmax over the class axis gives the output index 0..4; labels are 1..5, hence the + 1.
# The result stays a plain list, as before, and replaces the logistic-regression predictions.
Y_test_pred = (np.argmax(test_probabilities, axis = 1) + 1).tolist()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# visualizing the development of the loss and accuracy (of the last batch in every epoch) with respect to the epoch
# NOTE(review): whether the history values are last-batch or per-epoch aggregates depends on Keras — confirm.
# The figure combines the histories of the three training stages (15 epochs in total):
# accuracy on the left y-axis, loss on a second y-axis on the right.

plt.figure()
fig = plt.gcf()
fig.set_size_inches(10, 8, forward = True)
    
# custom line colors
color_accuracy = '#FF9933'
color_loss = '#0099FF'

# number of epochs
x = np.arange(1, 16, 1)

# vertical guides at the epochs where the learning rate was lowered
plt.axvline(5, color = '#999999', linestyle = '--', zorder = 0)
plt.axvline(10, color = '#999999', linestyle = '--', zorder = 0)

# extract the series from the history
# NOTE(review): the keys 'acc' / 'val_acc' depend on the installed Keras version
# (other versions may use 'accuracy' / 'val_accuracy') — confirm.
y1 = history_epochs_0_5.history['acc'] + history_epochs_5_10.history['acc'] + history_epochs_10_15.history['acc']
y2 = history_epochs_0_5.history['val_acc'] + history_epochs_5_10.history['val_acc'] + history_epochs_10_15.history['val_acc']
y3 = history_epochs_0_5.history['loss'] + history_epochs_5_10.history['loss'] + history_epochs_10_15.history['loss']
y4 = history_epochs_0_5.history['val_loss'] + history_epochs_5_10.history['val_loss'] + history_epochs_10_15.history['val_loss']

ax1 = plt.gca()

# plot the accuracy series
# note: accuracy_train / accuracy_validation are rebound here to the plotted line handles;
# they no longer hold the logistic-regression accuracies computed earlier in the notebook
accuracy_train, = plt.plot(x, y1, color_accuracy, linewidth = 0.75, linestyle = '-', zorder = 3)
accuracy_validation, = plt.plot(x, y2, color_accuracy, linewidth = 0.75, linestyle = '--', zorder = 3)

# accuracy axis runs from 0 to 1 with tick labels shown as percentages in steps of 10
ax1.set_ylim([0, 1])
yticks_major = np.round(np.linspace(0, 1, 11), 1)
yticks_major_str = (yticks_major * 100).astype(int).astype(str).tolist()
yticks_labels = [x + ' %' for x in yticks_major_str]
ax1.set_yticks(yticks_major)
ax1.set_yticklabels(yticks_labels, fontsize = 10)

ax1.set_xlabel('epoch', fontsize = 11, labelpad = 10)
ax1.set_ylabel('accuracy', fontsize = 11)

# second y-axis sharing the x-axis with ax1
# NOTE(review): the pyplot calls below are expected to draw on this twin axis because it
# becomes the current axes — confirm; statement order matters in this cell
ax2 = ax1.twinx()

# plot the loss series
loss_train, = plt.plot(x, y3, color_loss, linewidth = 0.75, linestyle = '-', zorder = 3)
loss_validation, = plt.plot(x, y4, color_loss, linewidth = 0.75, linestyle = '--', zorder = 3)

ax2.set_ylim([0, 3])
ax2.set_ylabel('loss', fontsize = 11)
ax1.grid(color = '#333333', linestyle = '--', linewidth = 0.25, zorder = 1)

# ticks are requested at 1, 3, ..., 19, while the x-limits set below only span 0 to 16
xticks_major = np.round(np.linspace(1, 19, 10), 2)
ax1.set_xticks(xticks_major)
ax1.set_xlim([0, 16])

plt.title('\nAccuracy and Loss\n', fontsize = 14)

# label each training stage with its learning rate and mark its extent with a double-headed arrow
plt.text(1, 1.35, 'Adam with lr = 1e-3', fontsize = 12, color = '#666666', multialignment = 'center')

plt.annotate("",
             xy = (0.25, 1.5),
             xytext = (4.75, 1.5),
             arrowprops = dict(arrowstyle = "<->", facecolor = '#666666'))

plt.text(6, 1.35, 'Adam with lr = 1e-4', fontsize = 12, color = '#666666', multialignment = 'center')

plt.annotate("",
             xy = (5.25, 1.5),
             xytext = (9.75, 1.5),
             arrowprops = dict(arrowstyle = "<->", facecolor = '#666666'))

plt.text(11, 1.35, 'Adam with lr = 1e-5', fontsize = 12, color = '#666666', multialignment = 'center')

plt.annotate("",
             xy = (10.25, 1.5),
             xytext = (14.75, 1.5),
             arrowprops = dict(arrowstyle = "<->", facecolor = '#666666'))

# single legend covering the lines of both y-axes
plt.legend([accuracy_train, accuracy_validation, loss_train, loss_validation],
               ['accuracy training set',
                'accuracy validation set',
                'loss training set',
                'loss validation set'],
                loc = 2,
                facecolor = 'white',
                edgecolor = 'black',
                borderaxespad = 1)

plt.show()

# write the figure to disk through the figure handle captured at the top of the cell
filename = 'results/accuracy_loss_final.png'
fig.savefig(filename)

In [None]:
%%javascript
// Replace the notebook output area's _should_scroll check with a function that always
// returns false. NOTE(review): presumably this keeps long cell outputs from being
// collapsed into a scrollable box — confirm against the notebook front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# create confusion matrices, with and without normalization

# reset matplotlib's rcParams to the library defaults before the figures of this cell are drawn
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

def plot_confusion_matrix(cm, classes, normalize = False, title = 'Confusion matrix', cmap = plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix with true labels on the rows and predicted labels
        on the columns.
    classes : sequence of str
        Tick labels for both axes, in the same order as the rows / columns of cm.
    normalize : bool, default False
        When True every row is divided by its row total, so each cell shows the
        fraction of that true class, and the colour scale is fixed to [0, 1].
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The function only prints and draws.
    """
    if normalize:
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        # a class without any true samples has a row total of zero; divide by one
        # instead so the row stays all-zero rather than turning into NaN
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)

    plt.colorbar()
    if normalize:
        plt.clim(0, 1)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # write every cell value into the heat map; use white text on dark cells
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # run the layout pass last so the axis labels are taken into account
    plt.tight_layout()

class_names = ['HH',
               'H',
               'M',
               'L',
               'LL']

# integer labels 1..5 in the same order as class_names
class_labels = list(range(1, len(class_names) + 1))

# Compute confusion matrix.
# Passing the labels explicitly guarantees a 5 x 5 matrix whose rows / columns line up
# with class_names even when a class is absent from both Y_test and Y_test_pred.
# Samples carrying a label outside 1..5 are ignored by confusion_matrix.
cnf_matrix = confusion_matrix(Y_test, Y_test_pred, labels = class_labels)
np.set_printoptions(precision = 2)

# number of correct predictions = sum of the diagonal; the counts do not change
# between the two plots, so this is computed once
accuracy = np.trace(cnf_matrix)

# Plot non-normalized confusion matrix
plt.figure(figsize = (10, 10))
plot_confusion_matrix(cnf_matrix, classes = class_names, title = 'Confusion matrix, without normalization')

print('\nThe accuracy is:', accuracy / cnf_matrix.sum(), '\n')

# Plot normalized confusion matrix
plt.figure(figsize = (10, 10))
plot_confusion_matrix(cnf_matrix, classes = class_names, normalize = True, title = 'Normalized confusion matrix')

print('\nThe accuracy is:', accuracy / cnf_matrix.sum())

plt.show()

In [None]:
# let's add the predictions to the features to analyse a bit more

# pair every test image_id with the prediction at the same position
predictions = list(zip(image_ids_test2['image_id'].values, Y_test_pred))

predictions_df = pd.DataFrame(predictions, columns = ['image_id', 'prediction'])

In [None]:
# join the predictions with the full dataset and keep only the columns needed for this analysis
merged_predictions = pd.merge(account_data_03, predictions_df, on = 'image_id', how = 'inner')
predictions_df = merged_predictions[['image_id', 'resort', 'cluster', 'likes_groups', 'prediction']]

# turn the likes_groups value into its leading digit so it is comparable with the prediction
predictions_df['likes_groups'] = predictions_df['likes_groups'].apply(lambda label: int(str(label)[0]))

In [None]:
# display the table of image_id, resort, cluster, true likes group and prediction
predictions_df

In [None]:
# count the images per (resort, true category, predicted category) combination
overview = (
    predictions_df
    .groupby(['resort', 'likes_groups', 'prediction'])['image_id']
    .count()
    .reset_index()
    .rename(columns = {'image_id': 'count'})
)

# flag the combinations in which the prediction equals the true category
is_correct = overview['prediction'] == overview['likes_groups']
overview['correct'] = np.where(is_correct, 1, 0)

In [None]:
# Number of test samples per resort, split into wrong (correct == 0) and right (correct == 1).
counts_per_group = overview.groupby(['resort', 'correct'])['count'].sum()

# A resort whose predictions are all wrong (or all right) only has one of the two rows.
# Complete the grid with zero counts so every resort has a correct == 1 row; otherwise
# resorts with 0 % accuracy silently drop out of any ranking that filters on correct == 1.
all_resort_outcomes = pd.MultiIndex.from_product(
    [counts_per_group.index.get_level_values('resort').unique(), [0, 1]],
    names = ['resort', 'correct'])

sum_per_group = (
    counts_per_group
    .reindex(all_resort_outcomes, fill_value = 0)
    .reset_index()
    .rename(columns = {'count': 'count_per_group'})
)

# total number of test samples per resort
sum_total = overview[['resort', 'count']].groupby(['resort']).sum().reset_index().rename(columns = {'count': 'total'})

# share of each outcome within its resort, in percent
percentage = sum_per_group.merge(sum_total, on = 'resort', how = 'outer')
percentage['percentage'] = 100 * percentage['count_per_group'] / percentage['total']

In [None]:
# ten resorts with the highest share of correct predictions,
# restricted to resorts with at least 100 samples in the test set
has_enough_samples = percentage['total'] >= 100
is_correct_row = percentage['correct'] == 1

percentage_top10 = (
    percentage[has_enough_samples & is_correct_row]
    .nlargest(10, 'percentage')
    .sort_values(by = 'percentage', ascending = False)
    .set_index('resort')
)
percentage_top10

In [None]:
# ten resorts with the lowest share of correct predictions, shown from highest to lowest
# NOTE(review): unlike the top-10 selection, no minimum number of samples is required
# here — confirm that this is intended
correct_rows = percentage[percentage['correct'] == 1]

percentage_worst10 = (
    correct_rows
    .nsmallest(10, 'percentage')
    .sort_values(by = 'percentage', ascending = False)
    .set_index('resort')
)
percentage_worst10

In [None]:
# resort names of both rankings, in ranking order
top10_resorts = percentage_top10.index.tolist()
worst10_resorts = percentage_worst10.index.tolist()

# interleave them: even slots take the best resorts, odd slots the worst resorts
xticks_labels = [None] * (len(top10_resorts) + len(worst10_resorts))
xticks_labels[0::2] = top10_resorts
xticks_labels[1::2] = worst10_resorts

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Bar chart of the ten best and ten worst resorts by accuracy. The bars are interleaved:
# the best resorts sit on `index`, the worst resorts one unit to the right of them.

fig = plt.figure(figsize = (12, 6))
ax = fig.add_subplot(111)

index = np.linspace(0, 19, 10)
bar_width = 0.6

top10 = plt.bar(index,
                percentage_top10['percentage'],
                bar_width,
                color = '#0099FF',
                ec = 'black',
                linewidth = 0.5)

X = index
Y = np.array(percentage_top10['percentage'])
Z1 = np.array(percentage_top10['count_per_group'])
Z2 = np.array(percentage_top10['total'])

# annotate every bar with [correct predictions / samples in the test set]
for a, b, c, d in zip(X, Y, Z1, Z2):
    plt.text(a + 0.05, b + 3, '[' + str(c) + ' / ' + str(d) + ']', fontsize = 8, ha = 'center', va = 'center')

worst10 = plt.bar(index + 1,
                  percentage_worst10['percentage'],
                  bar_width,
                  color = '#FF9933',
                  ec = 'black',
                  linewidth = 0.5)

X = index + 1
Y = np.array(percentage_worst10['percentage'])
Z1 = np.array(percentage_worst10['count_per_group'])
Z2 = np.array(percentage_worst10['total'])

for a, b, c, d in zip(X, Y, Z1, Z2):
    plt.text(a + 0.05, b + 3, '[' + str(c) + ' / ' + str(d) + ']', fontsize = 8, ha = 'center', va = 'center')

# Put the ticks exactly under the bars, in the same interleaved order as xticks_labels
# (best, worst, best, worst, ...). Evenly spaced ticks from np.linspace(0, 20, 20) do not
# coincide with the bar positions, so the resort names drifted away from their bars.
xticks_major = np.column_stack((index, index + 1)).ravel()
ax.set_xticks(xticks_major)
ax.set_xticklabels(xticks_labels, fontsize = 10, rotation = 'vertical')

# percentage axis with head room above 100 % for the annotations
plt.ylim(0, 110)
yticks_major = np.round(np.linspace(0, 100, 11), 10)
yticks_major_str = (yticks_major).astype(int).astype(str).tolist()
yticks_labels = [x + ' %' for x in yticks_major_str]
ax.set_yticks(yticks_major)
ax.set_yticklabels(yticks_labels, fontsize = 10)
ax.yaxis.grid(color = '#333333', alpha = 0.25, zorder = 1)
ax.set_axisbelow(True)

legend = plt.legend([top10, worst10],
                ['ten resorts with the highest accuracy',
                'ten resorts with the lowest accuracy'],
                fontsize = 8,
                loc = 1,
                facecolor = 'white',
                edgecolor = 'black',
                borderaxespad = 1)

plt.suptitle('The ten resorts with the highest / lowest accuracy', fontsize = 14, y = 0.97)
plt.title('(number of correct predictions versus number of samples in the test set in brackets)', fontsize = 10, y = 1.02)

plt.xlabel('')
plt.ylabel('Accuracy', fontsize = 11)
plt.show()

# write the figure to disk through the figure handle created at the top of the cell
filename = 'results/top10_worst10_final.png'
fig.savefig(filename)