In [0]:
import os
import keras
import pickle
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras import backend as K
from keras.activations import softmax
from keras.applications import VGG19
from keras.applications.vgg19 import preprocess_input
from keras.callbacks import EarlyStopping
from keras.layers import GlobalAveragePooling2D, Dense, Dropout
from keras.models import Model
from keras.models import save_model, load_model
from keras.optimizers import SGD
from PIL import Image
from random import choices
from math import log
from math import ceil
from math import exp
from matplotlib.pyplot import xticks
from zipfile import ZipFile

Using TensorFlow backend.


In [0]:
# Global configuration shared by the generator, the model and the training loop.
img_size, crop_size = 256, 224      # stored image side / side of the crop fed to the network
n_classes, n_channels = 34, 3
batch_size = 136
n_epochs = 100
train_dir = './data/'
data_dir = './data/'

# Unpack the dataset archive into the current working directory.
with ZipFile('data.zip', 'r') as dat:
    dat.extractall()

# Load the pickled dictionaries describing the partitions (the code below reads
# the 'train', 'val' and 'test' keys) and the label of every sample ID.
# NOTE: pickle.load can execute arbitrary code; only use trusted files here.
with open('./partition', 'rb') as pickle_in:
    partition = pickle.load(pickle_in)

with open('./labels', 'rb') as pickle_in:
    labels = pickle.load(pickle_in)

In [0]:
#@title Functions

# -----------------------------------------------------------------------------------------------------------------------------
# BOOSTING FUNCTIONS
# -----------------------------------------------------------------------------------------------------------------------------
def compute_beta(i, Dt_dict, error_array, train_IDs, ensemble_method, n_classes=34, prob_array=None):
    """Compute the classifier weight (Beta) of boosting round ``i``.

    Parameters
    ----------
    i : int
        Boosting round; the distribution used is ``Dt_dict['{}'.format(i)]``.
    Dt_dict : dict
        Maps the round number (as a string) to a ``{sample_ID: weight}`` dict.
    error_array : sequence of int
        1 where the sample at the same position in ``train_IDs`` was
        misclassified, anything else where it was classified correctly.
    train_IDs : sequence
        Sample IDs, positionally aligned with ``error_array``.
    ensemble_method : str
        One of ``'M1'``, ``'SAMME'`` or ``'CONFADABOOST'``.
    n_classes : int
        Number of classes; only used by the SAMME formula.
    prob_array : sequence of sequences, optional
        Per-sample class probabilities; only used by CONFADABOOST, where the
        maximum of each row scales that sample's contribution to the error.

    Returns
    -------
    float or None
        * M1: ``error / (1 - error)``
        * SAMME: ``log((1 - error) / error) + log(n_classes - 1)``
        * CONFADABOOST: ``0.5 * log((1 - error) / error)``

        ``None`` is returned when M1 or CONFADABOOST abort because the error
        exceeds 0.5, and for an unrecognised ``ensemble_method``.

    Notes
    -----
    A zero error is not special-cased, so the log-ratio of the SAMME and
    CONFADABOOST branches is undefined for a perfect classifier.
    """
    weights = Dt_dict['{}'.format(i)]

    if ensemble_method == 'M1':
        # Error is the summed distribution weight of the misclassified samples.
        # The sample weight must be looked up by the sample position j; using
        # the round index i here would add the same sample's weight every time.
        error = np.sum(np.asarray([weights[train_IDs[j]] if el == 1 else 0
                                   for j, el in enumerate(error_array)]))
        # AdaBoost.M1 gives up once the weak learner is worse than 0.5.
        if error > 0.5:
            print('error {} > 0.5 at iteration {}, ABORTING'.format(error, i))
            return
        return error / (1 - error)

    elif ensemble_method == 'SAMME':
        # Weighted error rate: misclassified weight over total weight.
        error_w = sum(weights[train_IDs[j]] for j, el in enumerate(error_array) if el == 1)
        total_w = sum(weights.values())
        error = error_w / total_w
        return log((1 - error) / error) + log(n_classes - 1)

    elif ensemble_method == 'CONFADABOOST':
        # Each misclassified sample contributes its weight scaled by the
        # confidence (highest predicted probability) of the wrong prediction.
        error = np.sum(np.asarray([(weights[train_IDs[j]] * max(prob_array[j])) if el == 1 else 0
                                   for j, el in enumerate(error_array)]))
        if error > 0.5:
            print('error {} > 0.5 at iteration {}, ABORTING'.format(error, i))
            return
        return 0.5 * log((1 - error) / error)



def update_Dt(i, Dt_dict, Beta, error_array, method, train_IDs, prob_array=None):
    """Build the normalized sample distribution for boosting round ``i + 1``.

    Parameters
    ----------
    i : int
        Current boosting round; ``Dt_dict['{}'.format(i)]`` is the input distribution.
    Dt_dict : dict
        Maps the round number (as a string) to a ``{sample_ID: weight}`` dict.
    Beta : float
        Classifier weight returned by ``compute_beta`` for this round.
    error_array : sequence of int
        1 where the sample at the same position in ``train_IDs`` was misclassified.
    method : str
        ``'M1'``, ``'SAMME'`` or ``'CONFADABOOST'``.
    train_IDs : sequence
        Sample IDs, positionally aligned with ``error_array``.
    prob_array : sequence of sequences, optional
        Per-sample class probabilities; only read by CONFADABOOST.

    Returns
    -------
    dict
        ``{sample_ID: weight}`` with the weights divided by their sum, so the
        result is a distribution. (The function used to compute this
        normalized dict but return the unnormalized one.)
    """
    current = Dt_dict['{}'.format(i)]
    new_Dt = {}

    if method == 'M1':
        # AdaBoost.M1: correctly classified samples are shrunk by Beta,
        # misclassified samples keep their weight.
        for j, el in enumerate(error_array):
            ID = train_IDs[j]
            new_Dt[ID] = current[ID] if el == 1 else current[ID] * Beta

    elif method == 'SAMME':
        # SAMME: w_i <- w_i * exp(alpha * I(misclassified)). The weight of
        # *misclassified* samples grows; the previous code scaled the correctly
        # classified ones instead, which focuses later rounds on easy samples.
        for j, el in enumerate(error_array):
            ID = train_IDs[j]
            new_Dt[ID] = current[ID] * exp(Beta) if el == 1 else current[ID]

    elif method == 'CONFADABOOST':
        # NOTE(review): the ConfAdaBoost.M1 update is usually written as
        # exp((1/2 - I(correct)) * p * alpha) for every sample; the expression
        # below is kept as found (exp(0.5 - Beta * p) for correct samples only)
        # -- confirm against the paper before relying on it.
        for j, el in enumerate(error_array):
            ID = train_IDs[j]
            if el == 1:
                new_Dt[ID] = current[ID]
            else:
                new_Dt[ID] = current[ID] * exp(0.5 - Beta * max(prob_array[j]))

    # Normalize so the weights sum to one.
    Zt = np.sum(list(new_Dt.values()))
    return {ID: weight / Zt for ID, weight in new_Dt.items()}

# -----------------------------------------------------------------------------------------------------------------------------
# MODEL INITIALIZATION
# -----------------------------------------------------------------------------------------------------------------------------
def initialize_model(crop_size, n_classes, n_freezed_layers, model='VGG19'):
    """Build, partially freeze and compile the VGG19-based classifier.

    An ImageNet-initialised VGG19 without its top is extended with global
    average pooling, a 1024-unit ReLU layer, dropout (rate 0.75) and a softmax
    layer with ``n_classes`` outputs. Weights are then read from
    'top_weights.h5', the first ``n_freezed_layers`` layers are made
    non-trainable and the network is compiled with SGD (lr 0.005, momentum 0.9)
    and categorical cross-entropy, tracking accuracy.

    The ``model`` argument is accepted but not consulted: VGG19 is always used.

    Returns the compiled Keras model.
    """
    backbone = VGG19(weights='imagenet', include_top=False, input_tensor=None,
                     input_shape=(crop_size, crop_size, 3))

    # Classification head stacked on the convolutional features.
    pooled = GlobalAveragePooling2D()(backbone.output)
    hidden = Dense(1024, activation='relu')(pooled)
    hidden = Dropout(0.75)(hidden)  # dropout to reduce overfitting
    predictions = Dense(n_classes, activation='softmax')(hidden)

    # The full network that is going to be trained.
    net = Model(inputs=backbone.input, outputs=predictions)
    net.load_weights('top_weights.h5')

    # Freeze the first n_freezed_layers layers.
    for frozen_layer in net.layers[:n_freezed_layers]:
        frozen_layer.trainable = False

    net.compile(optimizer=SGD(lr=0.005, momentum=0.9),
                loss='categorical_crossentropy',
                metrics=['acc'])

    return net

# -----------------------------------------------------------------------------------------------------------------------------
# DATAGENERATOR CLASS
# -----------------------------------------------------------------------------------------------------------------------------
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of randomly cropped images.

    Every batch holds ``batch_size`` images loaded from
    '/content/data2/<ID>.jpg', randomly cropped to ``crop_size`` x ``crop_size``
    and, in training mode, mirrored left-right with probability 0.5. Labels are
    one-hot encoded. In training mode, and unless ``ensemble_method`` is
    'Bagging', each batch also carries per-sample weights taken from
    ``Dt_dict['{}'.format(i)]`` and divided by the largest weight of that
    distribution.

    Parameters
    ----------
    list_IDs : sequence
        Sample IDs served by this generator.
    labels : dict
        Maps sample ID to its class label (converted with ``int``).
    data_dir : str
        Stored on the instance; NOTE(review): images are currently read from a
        hard-coded path and this value is not used for loading.
    Dt_dict : dict
        Maps round number (as string) to ``{sample_ID: weight}``.
    i : int
        Round whose distribution provides the sample weights.
    ensemble_method : str
        'Bagging' disables sample weights; any other value enables them for
        training generators.
    batch_size, n_channels, n_classes : int
        Batch geometry and number of one-hot classes.
    shuffle : bool
        Reshuffle the sample order at construction and after every epoch.
    train : bool
        Enables the flip augmentation and the sample weights.
    DisturbLabel : tuple (bool, float or None)
        ``(enabled, alpha)``; when enabled each label of a batch is replaced,
        with probability ``alpha``, by a label drawn uniformly from the set of
        labels present in ``labels``.
    crop_size, img_size : int
        Side of the crop and assumed side of the stored images.
    """

    def __init__(self, list_IDs, labels, data_dir, Dt_dict, i, ensemble_method,
                 batch_size=136, n_channels=3, n_classes=34,
                 shuffle=True, train=True, DisturbLabel=(False, None),
                 crop_size=224, img_size=256):
        """Store the configuration and build the initial sample order."""
        self.dim = (crop_size, crop_size)
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        # Needs list_IDs and shuffle, both assigned above.
        self.on_epoch_end()
        self.train = train
        self.DisturbLabel = DisturbLabel[0]
        self.alpha = DisturbLabel[1]
        self.possible_labels = list(set(labels.values()))
        self.crop_size = crop_size
        self.img_size = img_size
        self.data_dir = data_dir
        self.Dt_dict = Dt_dict
        self.i = i
        self.ensemble_method = ensemble_method
        # Becomes non-zero once the distribution in use has been printed.
        self.count = 0

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y, sample_weight)`` or ``(X, y)``."""
        # Positions (into list_IDs) of the samples that form this batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions to sample IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Weighted batches only for boosting-style training generators
        if self.ensemble_method != 'Bagging' and self.train:
            X, y, sample_weight = self.__data_generation(list_IDs_temp)
            return X, y, sample_weight
        else:
            X, y = self.__data_generation(list_IDs_temp)
            return X, y

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load, augment and encode the samples in ``list_IDs_temp``.

        Returns ``(X, y_onehot, sample_weight)`` when sample weights are in use
        and ``(X, y_onehot)`` otherwise. ``X`` has shape
        ``(batch_size, crop_size, crop_size, n_channels)`` and is passed
        through ``preprocess_input``.
        """
        use_weights = self.ensemble_method != 'Bagging' and self.train

        # Initialization
        X = np.empty((self.batch_size, self.crop_size, self.crop_size, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)
        sample_weight = np.empty((self.batch_size), dtype=float)

        if use_weights:
            # Look the distribution up once per batch rather than once per
            # sample; its maximum is used to rescale the weights on return.
            weights = self.Dt_dict['{}'.format(self.i)]
            if self.count == 0:
                print('using sample distribution {}'.format(self.i), weights)
                print()
                print()
                self.count += 1
            max_sample_weight = max(weights.values())

        for n, ID in enumerate(list_IDs_temp):
            # Load sample; the context manager closes the file handle promptly.
            # NOTE(review): absolute path, self.data_dir is not used here.
            with Image.open('/content/data2/{}.jpg'.format(ID)) as img:
                sample = np.asarray(img.convert('RGB'), dtype=int)

            # RANDOM CROP (applied in every mode, not only when training)
            dx = self.crop_size
            dy = self.crop_size
            x_pos = np.random.randint(0, self.img_size - dx + 1)
            y_pos = np.random.randint(0, self.img_size - dy + 1)
            sample = sample[y_pos:(y_pos + dy), x_pos:(x_pos + dx), :]

            if self.train:
                # HORIZONTAL FLIP with 0.5 chance. Images are (rows, cols,
                # channels), so a left-right mirror reverses axis 1; reversing
                # axis 0 would turn the image upside down.
                if random.randint(0, 1):
                    sample = sample[:, ::-1, :]

            # Store sample
            X[n,] = sample
            # Store class
            y[n] = self.labels[ID]

            # Store the boosting weight of this sample
            if use_weights:
                sample_weight[n] = weights[ID]

        # DisturbLabel: randomly replace labels with probability alpha
        if self.DisturbLabel:
            disturbed_labels = []
            for label in y:
                if random.uniform(0, 1) <= self.alpha:
                    disturbed_labels.append(random.choice(self.possible_labels))
                else:
                    disturbed_labels.append(label)
            y = disturbed_labels

        y = [int(el) for el in y]

        if use_weights:
            return preprocess_input(X), keras.utils.to_categorical(y, num_classes=self.n_classes), (sample_weight / max_sample_weight)
        else:
            return preprocess_input(X), keras.utils.to_categorical(y, num_classes=self.n_classes)

        
# -----------------------------------------------------------------------------------------------------------------------------
# FUNCTION FOR TRAINING INDIVIDUAL CLASSIFIERS
# -----------------------------------------------------------------------------------------------------------------------------

def train(ensemble_method, n_models=10, n_epochs = 50, alpha=0.1):
    """Train ``n_models`` classifiers for one ensemble method and save the results.

    Relies on the module-level ``partition``, ``labels``, ``batch_size``,
    ``n_classes`` and ``data_dir``.

    For every round ``i`` the function
      1. picks the training IDs (a bootstrap sample for 'Bagging', the full
         training partition otherwise),
      2. fits a fresh model from ``initialize_model`` with early stopping on
         the validation loss,
      3. saves the model and its test-set probabilities
         ('pred_prob-<method>-model-<i>.npy'),
      4. for every method other than 'Bagging' and 'Disturblabel', saves the
         training-set probabilities ('train-<method>-model-<i>.npy'), computes
         Beta and the next sample distribution and pickles it to 'Dt_dic-<i+1>'.

    Parameters
    ----------
    ensemble_method : str
        'Bagging', 'Disturblabel', 'M1', 'SAMME' or 'CONFADABOOST'.
    n_models : int
        Number of classifiers (rounds) to train.
    n_epochs : int
        Maximum number of epochs per classifier.
    alpha : float
        Label-disturbance probability; only used by 'Disturblabel'.

    Returns
    -------
    None
    """
    # Initial sample distribution: uniform probabilities for the boosting
    # methods, plain weight 1 for everything else.
    if ensemble_method in ('SAMME', 'CONFADABOOST', 'M1'):
        initial_Dt = {ID: 1 / len(partition['train']) for ID in partition['train']}
    else:
        initial_Dt = {ID: 1 for ID in partition['train']}
    Dt_dict = {'0': initial_Dt}

    # Optional checkpoint from an earlier run. It used to be opened
    # unconditionally, which made a fresh run fail with FileNotFoundError.
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    if os.path.exists('Dt_dic-9'):
        with open('Dt_dic-9', 'rb') as pickle_in:
            Dt_dict['9'] = pickle.load(pickle_in)

    # ------------------------------------------------------------------------------------------------------------------
    # TRAINING
    # ------------------------------------------------------------------------------------------------------------------
    for i in range(n_models):
        # -------------------------------
        # Subset if BAGGING
        # -------------------------------
        if ensemble_method == 'Bagging':
            train_IDs = choices(partition['train'], k=len(partition['train']))
        else:
            train_IDs = partition['train']

        # -------------------------------
        # DEFINE GENERATORS
        # -------------------------------
        # Only the Disturblabel method perturbs the training labels.
        if ensemble_method == 'Disturblabel':
            disturb_label = (True, alpha)
        else:
            disturb_label = (False, 0.1)

        shared_kwargs = dict(ensemble_method=ensemble_method, n_classes=n_classes,
                             data_dir=data_dir, Dt_dict=Dt_dict, i=i)

        training_generator = DataGenerator(train_IDs, labels,
                                           batch_size=batch_size, shuffle=True,
                                           train=True, DisturbLabel=disturb_label,
                                           **shared_kwargs)

        validation_generator = DataGenerator(partition['val'], labels,
                                             batch_size=batch_size, shuffle=True,
                                             train=False, **shared_kwargs)

        # One sample per batch and no shuffling, so predictions line up with
        # train_IDs / partition['test'].
        boosting_generator = DataGenerator(train_IDs, labels,
                                           batch_size=1, shuffle=False,
                                           train=False, **shared_kwargs)

        testing_generator = DataGenerator(partition['test'], labels,
                                          batch_size=1, shuffle=False,
                                          train=False, **shared_kwargs)

        # --------------------------------------------------------------------------------------------------------------
        # TRAIN MODEL AND SAVE IT
        # --------------------------------------------------------------------------------------------------------------
        model = initialize_model(224, n_classes, 13)

        early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=10)
        model.fit_generator(generator=training_generator,
                            steps_per_epoch=len(train_IDs) / batch_size,
                            validation_data=validation_generator,
                            validation_steps=len(partition['val']) / batch_size,
                            epochs=n_epochs,
                            use_multiprocessing=False,
                            verbose=1,
                            callbacks=[early_stopping])

        # save model ('Disturblabel' was misspelled here, so this branch never ran)
        if ensemble_method == 'Disturblabel':
            model.save('{}-model'.format(ensemble_method))
        else:
            model.save('{}-model-{}'.format(ensemble_method, i))

        # ----------------------------------
        # TEST PREDICTIONS
        # ----------------------------------
        y_test_prob = model.predict_generator(testing_generator, steps=len(partition['test']), verbose=1)
        np.save('pred_prob-{}-model-{}'.format(ensemble_method, i), y_test_prob)

        # ---------------------------------------------
        # BOOSTING ONLY: UPDATE SAMPLE DISTRIBUTION D_t
        # ---------------------------------------------
        if ensemble_method != 'Bagging' and ensemble_method != 'Disturblabel':
            print('updating weights for {} time'.format(i))
            # Prediction array of (len(X_train), n_classes) with probabilities as values
            y_train_prob = model.predict_generator(boosting_generator, steps=len(train_IDs), verbose=1)
            # The file name template was previously saved unformatted, so every
            # round overwrote the same literal 'train-{}-model-{}.npy'.
            np.save('train-{}-model-{}'.format(ensemble_method, i), y_train_prob)
            # Transform prediction probabilities to a list of predicted classes
            y_train_pred = [np.argmax(row) for row in y_train_prob]
            y_train_true = [int(labels[ID]) for ID in train_IDs]
            # Misclassification indicator: 1 where prediction and truth differ
            error_array = np.asarray([0 if pred == true else 1
                                      for pred, true in zip(y_train_pred, y_train_true)])
            # Compute new sample distribution from Betas
            Beta = compute_beta(i, Dt_dict, error_array=error_array, ensemble_method=ensemble_method,
                                n_classes=n_classes, prob_array=y_train_prob, train_IDs=train_IDs)
            if Beta is None:
                # compute_beta aborted (it prints the reason); no distribution
                # can be derived for the next round, so boosting stops here
                # instead of failing inside update_Dt.
                break
            new_Dt = update_Dt(i, Dt_dict, Beta, error_array=error_array, method=ensemble_method,
                               prob_array=y_train_prob, train_IDs=train_IDs)
            # Add new sample distribution to distribution dictionary Dt_dict
            Dt_dict['{}'.format(i + 1)] = new_Dt
            with open('Dt_dic-{}'.format(i + 1), 'wb') as pickle_out:
                pickle.dump(new_Dt, pickle_out)

    return

In [0]:
#@title Top layer training
# Trains the network once on the full training partition and stores the
# resulting weights in 'top_weights.h5'.
#
# The generators previously read `ensemble_method`, `Dt_dict` and `i` from
# leftover kernel state, so the cell raised NameError after a kernel restart.
# They are now passed explicitly: with ensemble_method='Bagging' the generator
# yields plain (X, y) batches and never reads Dt_dict.
# NOTE(review): unweighted training is assumed to be the intent -- confirm.
#
# NOTE(review): initialize_model() itself loads 'top_weights.h5', so that file
# must already exist before this cell can produce it -- confirm how the first
# version of the file is meant to be created.
training_generator = DataGenerator(partition['train'], labels,
                                   batch_size=batch_size, ensemble_method='Bagging',
                                   shuffle=True,
                                   n_classes=n_classes,
                                   train=True,
                                   DisturbLabel=(False, 0.1),
                                   data_dir=data_dir, Dt_dict={}, i=0)

validation_generator = DataGenerator(partition['val'], labels, ensemble_method='Bagging',
                                     batch_size=batch_size,
                                     shuffle=True,
                                     n_classes=n_classes,
                                     train=False,
                                     data_dir=data_dir, Dt_dict={}, i=0)

model = initialize_model(224, n_classes, 10)

# NOTE(review): this callback is created but not passed to fit_generator below,
# so it has no effect -- confirm whether early stopping is wanted here.
early_stopping = EarlyStopping(monitor='val_loss', patience=15)
hist = model.fit_generator(generator=training_generator,
                           steps_per_epoch=len(partition['train']) / batch_size,
                           validation_data=validation_generator,
                           validation_steps=len(partition['val']) / batch_size,
                           epochs=20,
                           use_multiprocessing=False,
                           verbose=1)

model.save_weights('top_weights.h5')

In [0]:
# Launch every experiment in sequence: bagging first, then AdaBoost.M1, two
# single-model DisturbLabel runs (alpha 0.1 and 0.2) and finally ConfAdaBoost.
for run_kwargs in (
    dict(ensemble_method='Bagging', n_epochs=100),
    dict(ensemble_method='M1', n_epochs=100),
    dict(ensemble_method='Disturblabel', alpha=0.1, n_models=1, n_epochs=100),
    dict(ensemble_method='Disturblabel', alpha=0.2, n_models=1, n_epochs=100),
    dict(ensemble_method='CONFADABOOST', n_epochs=100),
):
    train(**run_kwargs)