In [1]:
import numpy as np
import pandas as pd
from Utilities.evaluation_utils import *
from itertools import combinations
from DataPreparation.dataset_preparation import get_catsvsdogs_dataset
import os
from Utilities.evaluation_utils import *

%matplotlib inline

In [2]:
# Directory handed to get_catsvsdogs_dataset as the dataset location.
data_dir = 'Dataset/'
# Passed to get_catsvsdogs_dataset; presumably the fraction of labelled data
# held out for validation -- TODO confirm against the loader.
validation_split = 0.2
# Passed to get_catsvsdogs_dataset; presumably seeds the train/validation
# split so that it is repeatable -- TODO confirm against the loader.
split_seed = 6135

In [3]:
# Fetch the train / validation / test arrays (normalize_train=True is
# requested) together with the moments returned alongside the training set.
dataset_splits = get_catsvsdogs_dataset(
    data_dir, validation_split, split_seed, normalize_train=True
)
X_train, y_train, X_val, y_val, X_test, X_train_moments = dataset_splits
mean_img, std_img = X_train_moments

# Report the shape of every returned array.
for split_caption, split_array in (
    ('Train data size: ', X_train),
    ('Train labels size: ', y_train),
    ('Val data size: ', X_val),
    ('Val labels size: ', y_val),
    ('Test data size: ', X_test),
):
    print(split_caption, split_array.shape)

Train data size:  (15999, 64, 64, 3)
Train labels size:  (15999,)
Val data size:  (3999, 64, 64, 3)
Val labels size:  (3999,)
Test data size:  (4999, 64, 64, 3)


In [20]:
# Directory that holds the per-model CSV exports.
PATH = 'CSV/'
# Keep only the files whose name contains 'val'. os.listdir returns entries in
# arbitrary order, so the names are sorted to keep file_names[0], the dict
# order and the tie-breaking of the ensemble searches below reproducible.
file_names = sorted(file_name for file_name in os.listdir(PATH) if 'val' in file_name)
# Maps each file name to the values of CSV columns 1 and 2 (presumably the
# two class probabilities -- confirm against the code that wrote the files).
model_probabilities = {}
for file_name in file_names:
    df = pd.read_csv(os.path.join(PATH, file_name), usecols=[1, 2])
    model_probabilities[file_name] = df.values

In [21]:
# Sanity check: show the first five rows loaded for the first file.
first_file_name = file_names[0]
print(model_probabilities[first_file_name][:5])

[[9.98989143e-01 1.01085663e-03]
 [9.98231291e-01 1.76870903e-03]
 [8.83388362e-04 9.99116612e-01]
 [8.71072195e-04 9.99128928e-01]
 [4.58009915e-04 9.99541990e-01]]


In [23]:
# Every candidate ensemble of 2, 3, 5 and 7 validation files.
combinations_2 = list(combinations(file_names, 2))
combinations_3 = list(combinations(file_names, 3))
combinations_5 = list(combinations(file_names, 5))
combinations_7 = list(combinations(file_names, 7))

# Exhaustive search over all pairs for the highest validation accuracy.
max_acc = 0
# Start from None so that a search without a winner (fewer than two files, or
# no pair scoring above 0) neither raises NameError nor prints a stale value
# left in the kernel by an earlier run of this cell.
best_combination = None
for combination in combinations_2:
    # Unnormalised sum of the members' outputs (not divided by ensemble size).
    avg_result = np.zeros((len(y_val), 2))
    for file_name in combination:
        avg_result += model_probabilities[file_name]
    ensemble_acc = accuracy(avg_result, y_val)
    # Strict comparison: on ties the earliest combination is kept.
    if ensemble_acc > max_acc:
        best_combination = combination
        max_acc = ensemble_acc

print('Ensemble 2 best result:')
print(max_acc)
print(best_combination)

Ensemble 2 best result:
0.9562390597649413
('B_VGG19_probs_val_55000.csv', 'E_Wide28_10_probs_val_45000.csv')


In [32]:
# Turn each model's two-column output into hard one-hot votes: a 1 in the
# argmax column of every row and 0 elsewhere, keyed like model_probabilities.
model_preds = {}
for model_name, probabilities in model_probabilities.items():
    row_index = np.arange(len(probabilities))
    one_hot_votes = np.zeros_like(probabilities)
    one_hot_votes[row_index, probabilities.argmax(1)] = 1
    model_preds[model_name] = one_hot_votes

In [36]:
def ensemble_eval(model_probabilities, labels, n, preds=True, rand_per_combination=50, seed=None):
    """Search every ``n``-model ensemble for the one with the highest accuracy.

    Each candidate ensemble sums its members' outputs (optionally scaled by
    random weights) into a ``(len(labels), 2)`` array and scores it with
    ``accuracy(scores, labels)``, a helper that must already be available in
    the notebook namespace.

    Parameters
    ----------
    model_probabilities : dict
        Maps a model name to an array that can be added to a
        ``(len(labels), 2)`` array.
    labels : sequence
        Ground-truth labels, handed unchanged to ``accuracy``.
    n : int
        Number of models per ensemble.
    preds : bool, default True
        Only the literal ``False`` makes members contribute their raw entries
        from ``model_probabilities``; any other value makes each member cast a
        one-hot vote for its argmax column. The votes are derived from the
        ``model_probabilities`` argument itself, not from a notebook global.
    rand_per_combination : int or None, default 50
        ``None`` evaluates each combination once with equal weights.
        Otherwise that many weight vectors are drawn from U(0, 1) and
        evaluated for every combination.
    seed : int or None, default None
        Seed for a private random stream used to draw the weights. ``None``
        keeps using NumPy's global random state.

    Returns
    -------
    tuple
        ``(best_acc, best_combination, best_randoms)``. ``best_randoms`` is
        the weight vector that produced ``best_acc`` (``None`` for unweighted
        searches). When no candidate scores above 0, or no combination
        exists, the result is ``(0, None, None)``.
    """
    if preds is False:
        member_outputs = model_probabilities
    else:
        # Hard voting: one-hot encode each model's argmax prediction.
        member_outputs = {}
        for name, probabilities in model_probabilities.items():
            votes = np.zeros_like(probabilities)
            votes[np.arange(len(probabilities)), probabilities.argmax(1)] = 1
            member_outputs[name] = votes

    if seed is None:
        draw_uniform = np.random.uniform
    else:
        draw_uniform = np.random.RandomState(seed).uniform

    best_acc = 0
    best_combination = None
    best_randoms = None
    for combination in combinations(list(model_probabilities), n):
        outputs = [member_outputs[model] for model in combination]
        if rand_per_combination is None:
            # A single unweighted evaluation per combination.
            trials = [None]
        else:
            # Lazily draw one weight vector per trial.
            trials = (draw_uniform(0, 1, n) for _ in range(rand_per_combination))
        for randoms in trials:
            ensemble_logits = np.zeros((len(labels), 2))
            for i, output in enumerate(outputs):
                ensemble_logits += output if randoms is None else randoms[i] * output
            ensemble_acc = accuracy(ensemble_logits, labels)
            # Strict comparison: on ties the earliest candidate is kept.
            if ensemble_acc > best_acc:
                best_acc = ensemble_acc
                best_combination = combination
                best_randoms = randoms
    return best_acc, best_combination, best_randoms

# Only the 7- and 10-model searches are run here, with ensemble_eval's
# defaults (hard votes, 50 random weightings per combination). Sizes 2, 3 and
# 5 can be searched the same way; ensemble_eval returns three values.
best_acc_7, best_combination_7, best_randoms_7 = ensemble_eval(model_probabilities, y_val, 7)
best_acc_10, best_combination_10, best_randoms_10 = ensemble_eval(model_probabilities, y_val, 10)

# Print accuracy, weights and members for each ensemble size, in that order.
for header, report in (
    ('Ensemble 7 best result:', (best_acc_7, best_randoms_7, best_combination_7)),
    ('Ensemble 10 best result:', (best_acc_10, best_randoms_10, best_combination_10)),
):
    print(header)
    for value in report:
        print(value)

Ensemble 2 best result:
0.9534883720930233
('A_VGG19_probs_val_65000.csv', 'E_Wide28_10_probs_val_40000.csv')
Ensemble 3 best result:
0.9574893723430857
('B_VGG19_probs_val_55000.csv', 'E_Wide28_10_probs_val_40000.csv', 'E_Wide28_10_probs_val_50000.csv')
Ensemble 5 best result:
0.9597399349837459
('A_VGG19_probs_val_70000.csv', 'C_VGG19_probs_val_60000.csv', 'D_Wide28_10_probs_val_25000.csv', 'D_Wide28_10_probs_val_35000.csv', 'E_Wide28_10_probs_val_40000.csv')
Ensemble 7 best result:
0.9602400600150037
('A_VGG19_probs_val_80000.csv', 'B_VGG19_probs_val_40000.csv', 'B_VGG19_probs_val_55000.csv', 'C_VGG19_probs_val_55000.csv', 'D_Wide28_10_probs_val_35000.csv', 'E_Wide28_10_probs_val_40000.csv', 'E_Wide28_10_probs_val_55000.csv')
Ensemble 10 best result:
0.9597399349837459
('A_VGG19_probs_val_70000.csv', 'A_VGG19_probs_val_80000.csv', 'B_VGG19_probs_val_55000.csv', 'C_VGG19_probs_val_55000.csv', 'C_VGG19_probs_val_60000.csv', 'D_Wide28_10_probs_val_25000.csv', 'E_Wide28_10_probs_val_400