In [None]:
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
from os import listdir
from shutil import copy
import itertools 
import operator
from PIL import Image
from scipy.misc import imresize

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classes_list = ['Animals',
                'Lifts',
                'Other',
                'People',
                'Summer activity',
                'Summer landscape',
                'Winter activity',
                'Winter landscape']

# every image is resized to this (height, width) before being flattened
IMAGE_SIZE = (224, 224)


def load_split(split_dir, class_names = classes_list, size = IMAGE_SIZE):
    """Load all images of one data split as flattened pixel vectors.

    Parameters
    ----------
    split_dir : str
        Directory that contains one sub-directory per class.
    class_names : sequence of str
        Names of the class sub-directories; the position of a name in this
        sequence is used as the integer label of its images.
    size : tuple of int
        Target (height, width) every image is resized to.

    Returns
    -------
    X : numpy.ndarray
        One row per image holding the flattened, resized pixel values.
    Y : numpy.ndarray
        Integer class label of every row of X.
    """
    features = []
    labels = []

    for label, class_name in enumerate(class_names):
        class_dir = join(split_dir, str(class_name))

        # listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore every batch-wise step downstream) reproducible
        for file_name in sorted(listdir(class_dir)):
            file_path = join(class_dir, file_name)
            if not isfile(file_path):
                continue

            # the context manager closes the file handle once the pixels are read;
            # converting to RGB gives every image the same number of channels, so
            # all flattened vectors have the same length
            with Image.open(file_path) as img:
                pixels = np.array(img.convert('RGB'))

            # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3, so this
            # requires an older SciPy; Pillow's Image.resize is the documented replacement
            features.append(imresize(pixels, size).reshape(-1))
            labels.append(label)

    return np.asarray(features), np.asarray(labels)


# training data
X_train, Y_train = load_split('image_classifier/training_data')

# validation data
X_val, Y_val = load_split('image_classifier/validation_data')

# test data
X_test, Y_test = load_split('image_classifier/test_data')

In [None]:
# shapes of the feature matrix and label vector of every split
tuple(split.shape for split in (X_train, Y_train, X_val, Y_val, X_test, Y_test))

In [None]:
# number of values in one flattened image: 224 x 224 pixels (the resize target used when
# loading), presumably times 3 colour channels - compare with the second dimension of X_train
224 * 224 * 3

In [None]:
# Standardise the features so that every column of the training set ends up with a mean
# of zero and a standard deviation of one. The scaler is fitted incrementally through
# partial_fit, one slice of the training set at a time.

scaler = StandardScaler()

batch_size = 250
n_samples = X_train.shape[0]

# update the scaler with consecutive slices of at most `batch_size` samples
# (slicing past the end of the array simply yields the shorter final batch)
for start in range(0, n_samples, batch_size):
    scaler.partial_fit(X_train[start: start + batch_size])

In [None]:
# transform the training, validation and test set with the standardscaler previously fitted

# number of samples passed to the scaler per transform call
SCALE_BATCH_SIZE = 250


def scale_in_batches(X, fitted_scaler, batch_size = SCALE_BATCH_SIZE):
    """Apply an already fitted scaler to X batch by batch and return float32 data.

    The result is written into a preallocated array instead of being grown with
    np.vstack, which would copy all previously transformed rows again on every
    batch.

    Parameters
    ----------
    X : numpy.ndarray
        Data to transform, one sample per row.
    fitted_scaler : object
        Object exposing a ``transform`` method, e.g. a fitted StandardScaler.
    batch_size : int
        Maximum number of rows handed to ``transform`` at once.

    Returns
    -------
    numpy.ndarray
        Array with the shape of X and dtype float32 holding the transformed values.
    """
    X = np.asarray(X)
    scaled = np.empty(X.shape, dtype = 'float32')

    for start in range(0, X.shape[0], batch_size):
        stop = start + batch_size
        # the assignment casts the transformed batch to float32
        scaled[start: stop] = fitted_scaler.transform(X[start: stop])

    return scaled


# transform the training set
X_train_scaled = scale_in_batches(X_train, scaler)

# transform the validation set
X_val_scaled = scale_in_batches(X_val, scaler)

# transform the test set
X_test_scaled = scale_in_batches(X_test, scaler)

In [None]:
# Sanity check of the scaled datasets: print dtype and shape, plus the sum of the column
# means and the sum of the column standard deviations. For the training set (the data the
# scaler was fitted on) the first sum should be close to zero and the second close to the
# number of features; the validation and test set only approximate this.

scaled_sets = [('training', X_train_scaled),
               ('validation', X_val_scaled),
               ('test', X_test_scaled)]

for position, (set_name, set_data) in enumerate(scaled_sets):
    if position > 0:
        # blank lines between two consecutive reports
        print('\n')
    print('Information about the ' + set_name + ' set:\n')
    print('   - datatype:', set_data.dtype)
    print('   - shape of the dataset:', set_data.shape)
    print('   - sum of the means of the columns:', round(set_data.mean(axis = 0).sum(), 2))
    print('   - sum of the standard deviations of the columns:', round(set_data.std(axis = 0).sum(), 2))

In [None]:
# Fit an incremental PCA that keeps 100 principal components. This fit is only used to
# visualise the cumulative explained variance; the goal is to find how many components
# are needed to explain 80% of the variance of the features.

ipca_settings = dict(n_components = 100,
                     whiten = True,
                     batch_size = 100)

ipca = IncrementalPCA(**ipca_settings)

ipca.fit(X_train_scaled)

In [None]:
# cumulative share of the variance explained by the first k components,
# with a leading 0 for k = 0 so the curve starts at the origin
cum_var = np.append(0, np.cumsum(ipca.explained_variance_ratio_))

# matching x values 0..n_components; derived from cum_var so both arrays always have
# the same length, whatever number of components `ipca` was fitted with
components = np.arange(cum_var.size)

In [None]:
import matplotlib.pyplot as plt

import matplotlib as mpl
# restore matplotlib's default styling; unlike rcParams.update(rcParamsDefault) this
# leaves backend-related settings alone, so the active notebook backend keeps working
mpl.rcdefaults()

fig, ax = plt.subplots(figsize = (10, 7))

# horizontal reference line at 80% explained variance, one value per plotted x position
threshold = np.full(components.shape, 0.8)

ax.set_title('\nExplained variance by n principal components\n', fontsize = 14)
ax.plot(components, cum_var, color = '#FF9933', linewidth = 0.75, linestyle = '-')
ax.plot(components, threshold, color = 'red', linewidth = 1, linestyle = '--')

ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)

ax.set_xlim(0, 100)

xticks_major = np.linspace(0, 100, 11).astype('int16')
ax.set_xticks(xticks_major)
ax.set_xticklabels(xticks_major, fontsize = 11)

# y axis from 0 to 1, labelled as percentages in steps of 10
ax.set_ylim(0, 1)
yticks_major = np.round(np.linspace(0, 1, 11), 1)
yticks_major_str = (yticks_major * 100).astype(int).astype(str).tolist()
yticks_labels = [x + ' %' for x in yticks_major_str]
ax.set_yticks(yticks_major)
ax.set_yticklabels(yticks_labels, fontsize = 11)

ax.set_xlabel('Number of principal components', fontsize = 11, labelpad = 10)
ax.set_ylabel('% variance of the original features explained', fontsize = 11)
ax.set_axisbelow(True)

# NOTE(review): the component count in the text and the positions of the text box and
# arrow are hard-coded; they need updating by hand if the data changes
ax.text(67, 0.7,
        r'67 principal components explain'"\n"r'80% of the variance of the features', fontsize = 10, multialignment = 'center',
        bbox = dict(boxstyle = 'round4', facecolor = 'white', alpha = 0.5))

ax.annotate("",
            xy = (66, 0.8),
            xytext = (67.75, 0.76),
            arrowprops = dict(arrowstyle = "simple", facecolor = "black"))

# save before showing: the output directory is created when missing, and the file is
# written while the figure is guaranteed to still be alive
filename = 'results/IPCA_explained_variance.png'
os.makedirs(os.path.dirname(filename), exist_ok = True)
fig.savefig(filename)

plt.show()

In [None]:
# apply the IPCA transformation to the training, validation and test set, keeping the
# 67 principal components that the explained-variance plot marks as explaining 80% of
# the variance of the features

N_COMPONENTS_80 = 67

ipca = IncrementalPCA(n_components = N_COMPONENTS_80, whiten = True, batch_size = 1000)

ipca.fit(X_train_scaled)

# the number in the message is taken from the constant so it cannot drift from the model
print('The variance of the features explained by the {} principal components is:'.format(N_COMPONENTS_80), "{0:.2f}%".format(sum(ipca.explained_variance_ratio_) * 100))

# the PCA is fitted on the training set only and then applied to all three sets
ipca_train = ipca.transform(X_train_scaled)
ipca_val = ipca.transform(X_val_scaled)
ipca_test = ipca.transform(X_test_scaled)

In [None]:
# Report dtype, shape, the sum of the column means and the sum of the column standard
# deviations of the PCA-transformed datasets, to see what the transformation did to them.

ipca_sets = [('training', ipca_train),
             ('validation', ipca_val),
             ('test', ipca_test)]

for position, (set_name, set_data) in enumerate(ipca_sets):
    if position > 0:
        # blank lines between two consecutive reports
        print('\n')
    print('Information about the ' + set_name + ' set:\n')
    print('   - datatype:', set_data.dtype)
    print('   - shape of the dataset:', set_data.shape)
    print('   - sum of the means of the columns:', round(set_data.mean(axis = 0).sum(), 2))
    print('   - sum of the standard deviations of the columns:', round(set_data.std(axis = 0).sum(), 2))

In [None]:
# Multinomial logistic regression on the principal components, classifying the pictures
# into the 8 categories (one class weight per category below).
#
# Solvers that support the multinomial formulation: newton-cg, sag, saga, lbfgs

# weight per class label
# NOTE(review): presumably derived from the class frequencies - confirm where these come from
class_weights = {
    0: 5.6900,
    1: 1.9965,
    2: 1.5850,
    3: 2.6343,
    4: 3.2330,
    5: 1.6119,
    6: 0.6040,
    7: 1.0000,
}

clf = LogisticRegression(
    multi_class = 'multinomial',
    solver = 'newton-cg',
    C = 1,
    max_iter = 100000,
    class_weight = class_weights,
    random_state = 0,
)
clf.fit(ipca_train, Y_train)

# predictions for the training and the validation set
Y_train_pred = clf.predict(ipca_train)
Y_val_pred = clf.predict(ipca_val)

# share of correctly classified pictures per set
accuracy_train = accuracy_score(Y_train, Y_train_pred)
accuracy_validation = accuracy_score(Y_val, Y_val_pred)

print('Accuracy on the training set:', format(accuracy_train, '.2f'))
print('Accuracy on the validation set:', format(accuracy_validation, '.2f'))

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel on the principal components,
# using the same per-class weights as the logistic regression.

# weight per class label
class_weights = {0: 5.6900,
                 1: 1.9965,
                 2: 1.5850,
                 3: 2.6343,
                 4: 3.2330,
                 5: 1.6119,
                 6: 0.6040,
                 7: 1.0000}

# decision_function_shape only accepts 'ovo' or 'ovr'; the previous value 'multi' is not
# a valid option. 'ovr' is the library default and only affects the shape of
# decision_function, not the labels returned by predict.
clf = SVC(kernel = 'rbf', C = 0.01, gamma = 'auto', decision_function_shape = 'ovr', class_weight = class_weights).fit(ipca_train, Y_train)

# accuracy for the training set
Y_train_pred = clf.predict(ipca_train)
accuracy_train = accuracy_score(Y_train, Y_train_pred)

# accuracy for the validation set
Y_val_pred = clf.predict(ipca_val)
accuracy_validation = accuracy_score(Y_val, Y_val_pred)

print('Accuracy on the training set:', "{0:.2f}".format(accuracy_train))
print('Accuracy on the validation set:', "{0:.2f}".format(accuracy_validation))