In [1]:
import os

# Why Torch? You'll find the answer in the .md files! 
os.environ["KERAS_BACKEND"] = "torch"

import keras

from dataset.k_fold_dataset_wrapper import KFoldDatasetWrapper
from dataset.dataset_loader import dataset_loader
import keras_tuner
from torch.utils.data import DataLoader
from models.naive_dnn_gen.naive_dnn import NaiveDnnTunableWrapper
from models.structure.tunable_model_family_hypermodel import TunableModelFamilyHypermodel
from torch.utils.data import DataLoader
import pandas

In [27]:
# --- Initial setup ---
# Hyperparameter container shared with the tuner built further down.
hyperparameters = keras_tuner.HyperParameters()

# Load the train/test datasets, non-grayscale.
# (128, 128) is presumably the target image size -- TODO confirm in dataset_loader.
train, test = dataset_loader((128, 128), is_grayscale=False)

# Hand the training set to a 5-fold splitter.
dataset_split_controller = KFoldDatasetWrapper(5)
dataset_split_controller.load_data(train)

# Fold 0 provides the train/validation pair wrapped into the loaders below.
local_train, validation = dataset_split_controller.get_data_for_fold(0)
train_dataloader = DataLoader(local_train, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(validation, batch_size=16, shuffle=True)

# First Space Search
We expect the process to not yield good results as the DNN requires a huge number of neurons to work properly. 

After analyzing the results, we may consider a second search that is restricted to a smaller set of possible parameters.

In [40]:
from keras_tuner import BayesianOptimization
from utils.my_tuner import HistoryDeletingBayesianOptimization

# Hypermodel built from the naive-DNN tunable wrapper for a (3, 128, 128) input shape.
hypermodel = TunableModelFamilyHypermodel((3, 128, 128), NaiveDnnTunableWrapper())

# Values pinned for this search, registered in this exact order:
#   - "lr" and "momentum": the model uses SGD
#   - "layers": fixed to 3 for this project
for _hp_name, _hp_value in (
    ("lr", 1e-4),
    ("momentum", 0.9),
    ("layers", 3),
):
    hyperparameters.Fixed(_hp_name, _hp_value)

# Expectation: the tuner should find better models without dropout, since learning
# with dropout empirically tends to take longer.
# We hope to find at least one usable structure.
tuner = HistoryDeletingBayesianOptimization(
    hypermodel,
    objective='val_loss',
    max_trials=15,
    hyperparameters=hyperparameters,
    tune_new_entries=True,
    overwrite=False,
    directory="dnn-search",
    project_name="three-layers",
)

In [42]:
# TODO: override on_epoch_end to write per-epoch info.
# Run the hyperparameter search: 22 epochs per trial on the fold-0 train/validation loaders.
# The CSV log is appended inside this tuner's own project folder
# (directory="dnn-search", project_name="three-layers") so it sits next to the
# trial data of the search it describes.
tuner.search(
    train_dataloader,
    epochs=22,
    validation_data=validation_dataloader,
    callbacks=[
        keras.callbacks.CSVLogger("dnn-search/three-layers/search.log", separator=",", append=True)
    ],
)

In [44]:
# Print a summary of the 5 best trials recorded by the tuner.
tuner.results_summary(5)  # Top 5
# Last expression of the cell: the notebook displays the 5 best hyperparameter sets.
tuner.get_best_hyperparameters(5)


In [31]:
# The five best trials perform rather poorly, so we train one of them for longer with
# early stopping to see how well it generalizes.
# We pick a model that has dropout, which typically needs more time to converge.
train_model = NaiveDnnTunableWrapper()
best_five_hyperparameters = tuner.get_best_hyperparameters(5)
train_model.load_parameters(best_five_hyperparameters[3])  # fourth entry of the top-5 list

# Train on the full training set.
# NOTE(review): validation (and therefore early stopping) uses the `test` set here.
train_dataloader = DataLoader(train, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(test, batch_size=16, shuffle=True)

fine_tune_model = train_model.make_model((3, 128, 128))
sgd_optimizer = keras.optimizers.SGD(learning_rate=1e-4, momentum=0.9, nesterov=True)
train_model.compile_model(fine_tune_model, sgd_optimizer)

# Stop once val_loss has not improved by at least 1e-4 for 10 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min'
)
model_history = fine_tune_model.fit(
    train_dataloader,
    validation_data=validation_dataloader,
    epochs=100,
    callbacks=[early_stopping],
)


In [38]:
import pandas

# One row per epoch, one column per metric recorded during fit().
metrics_df1 = pandas.DataFrame(model_history.history)

# Loss curves, then accuracy curves, both clipped to the [0, 1] range.
metrics_df1.plot(y=["loss", "val_loss"], ylim=(0, 1))
metrics_df1.plot(y=["accuracy", "val_accuracy"], ylim=(0, 1))

In [39]:
# Display the per-epoch metrics table built from model_history.history.
metrics_df1

# Considerations
The results make me believe that the network has a hard time generalizing the data. The learning process might be too slow to see if the parameters we found are good enough. We should try to tune the lr of SGD. 

Either way, I will try to train the 3 best networks to see whether (via early stopping) convergence is reached with a good approximation.
What I expect to see is that (maybe outside the first one, which will do a little better) all the networks will yield a low accuracy on train and test (around 80%/70%).

To try this we use K-Fold Cross Validation

In [None]:
from models.k_fold_cv_procedure import k_fold_cv_procedure
from models.naive_dnn_gen.naive_dnn import HiddenLayerStructure, NaiveDnnWrapper

# Reload the data (only the first returned dataset is kept) and feed it to a 5-fold splitter.
train, _ = dataset_loader((128, 128), is_grayscale=False)
k_fold_controller = KFoldDatasetWrapper(5)
k_fold_controller.load_data(train)

# Two hidden layers built with first arguments 2048 and 1024 and second argument None.
# Presumably (units, dropout) -- TODO confirm against HiddenLayerStructure.
model_generator = NaiveDnnWrapper()
model_generator.hidden_layers = [
    HiddenLayerStructure(layer_width, None) for layer_width in (2048, 1024)
]

# Run the project's k-fold cross-validation procedure with the 'SGD' optimizer.
history = k_fold_cv_procedure(model_generator, (3, 128, 128), 'SGD', k_fold_controller)

In [2]:
import gc
import torch
from models.naive_dnn_gen.naive_dnn import HiddenLayerStructure, NaiveDnnWrapper

# Manual 5-fold cross validation of a fixed two-hidden-layer network.
train, test = dataset_loader((128, 128), is_grayscale=False)

# Use the splitter class that the notebook actually imports (KFoldDatasetWrapper);
# no `KFoldController` is imported or defined anywhere in the notebook.
k_fold_controller = KFoldDatasetWrapper(5)
k_fold_controller.load_data(train)

model_generator = NaiveDnnWrapper()
model_generator.hidden_layers = [
    HiddenLayerStructure(2048, None),
    HiddenLayerStructure(1024, None),
]

# One keras History object per fold, in fold order.
# NOTE(review): each History may keep its model reachable, which would limit what
# the per-fold memory release below can free -- confirm if OOM shows up.
history = []

# NOTE(review): assumes the splitter exposes the fold count as `.k` -- confirm.
for i in range(k_fold_controller.k):
    # Release cached GPU memory and collect garbage before each fold to reduce OOM risk.
    torch.cuda.empty_cache()
    gc.collect()

    train_i, validation_i = k_fold_controller.get_data_for_fold(i)

    train_dataloader = DataLoader(dataset=train_i, batch_size=16, shuffle=True)
    validation_dataloader = DataLoader(dataset=validation_i, batch_size=16, shuffle=True)

    # Fresh model for every fold so that no weights carry over between folds.
    model_i = model_generator.make_model((3, 128, 128))
    model_generator.compile_model(model_i, 'SGD')
    i_history = model_i.fit(
        train_dataloader, validation_data=validation_dataloader, epochs=100,
        callbacks=[
            # Stop once val_loss has not improved by at least 1e-4 for 10 consecutive epochs.
            keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min')]
    )

    history.append(i_history)


In [4]:
import numpy as np

# --- Plain training: average the last-epoch validation metrics over the folds ---
last_epoch_val_losses = [fold.history['val_loss'][-1] for fold in history]
last_epoch_val_accuracies = [fold.history['val_accuracy'][-1] for fold in history]
expected_val_loss = np.sum(last_epoch_val_losses) / len(history)
expected_val_acc = np.sum(last_epoch_val_accuracies) / len(history)

# --- Early stopping with weight restoring: use each fold's best (lowest val_loss) epoch ---
min_val_loss_i = [np.argmin(fold.history['val_loss']) for fold in history]
best_validation_loss_values = [
    fold.history['val_loss'][best_epoch] for fold, best_epoch in zip(history, min_val_loss_i)
]
best_validation_acc_values = [
    fold.history['val_accuracy'][best_epoch] for fold, best_epoch in zip(history, min_val_loss_i)
]
best_expected_val_loss = np.sum(best_validation_loss_values) / len(min_val_loss_i)
best_expected_val_accuracy = np.sum(best_validation_acc_values) / len(min_val_loss_i)

print(
    f"We expect the model to perform with a {expected_val_loss} loss and a {expected_val_acc} accuracy by normal training. \n"
    f"If we consider to restore the best weights we expect a model to perform with: {best_expected_val_loss} loss and {best_expected_val_accuracy} accuracy")

# The model underperforms: it converges slowly on the training data but cannot generalize
# to the validation data. It looks like we are underfitting.
# Patience might be set too low; we raise it to 10 for the next test, as suggested in:
# https://stats.stackexchange.com/questions/231061/how-to-use-early-stopping-properly-for-training-deep-neural-network
# http://users.diag.uniroma1.it/~palagi/didattica/sites/default/files/allegati/OMML_12th_lect_19-20_early%20stopping.pdf

In [5]:
# Per-epoch metrics of the first fold (index 0), one row per epoch.
metrics_df1 = pandas.DataFrame(history[0].history)

# Loss and accuracy curves clipped to [0, 1] ...
metrics_df1.plot(y=["loss", "val_loss"], ylim=(0, 1))
metrics_df1.plot(y=["accuracy", "val_accuracy"], ylim=(0, 1))
# ... and the same curves again with automatic y-axis limits.
metrics_df1.plot(y=["loss", "val_loss"])
metrics_df1.plot(y=["accuracy", "val_accuracy"])

In [10]:
# Display the per-epoch metrics table of the first fold.
metrics_df1

In [7]:
# Final run: train on the full training set.
# NOTE(review): validation (and therefore early stopping) uses the `test` set here.
train_dataloader = DataLoader(train, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(test, batch_size=16, shuffle=True)

# Two hidden layers, both built with first argument 550 and second argument None.
model_generator = NaiveDnnWrapper()
model_generator.hidden_layers = [HiddenLayerStructure(550, None) for _ in range(2)]

end_model = model_generator.make_model((3, 128, 128))
model_generator.compile_model(end_model, 'SGD')

# Stop once val_loss has not improved by at least 1e-4 for 10 consecutive epochs.
final_early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min'
)
# From here on `history` is a single keras History object (it was a list of them before).
history = end_model.fit(
    train_dataloader,
    validation_data=validation_dataloader,
    epochs=100,
    callbacks=[final_early_stopping],
)

In [9]:
import numpy as np

# Epoch with the lowest validation loss in the final training run.
best_trial_index = np.argmin(history.history['val_loss'])

# Compare the cross-validation estimate with the metrics observed at that epoch.
print(
    f"We expect the model to perform with a {best_expected_val_loss} loss and a {best_expected_val_accuracy} accuracy with early stopping and observed. \n"
    f"a real performance on the full set of with: {history.history['val_loss'][best_trial_index]} loss and {history.history['val_accuracy'][best_trial_index]} accuracy"
)

# Conclusion: our model is statistically coherent.

In [16]:
# NOTE(review): `a` is not read anywhere in the visible notebook; looks like a leftover -- confirm and remove.
a = 0
# No count is passed, so the tuner's default number of trials is summarized
# (presumably ten -- confirm for the custom tuner class).
tuner.results_summary()  # Ten best
# Last expression of the cell: the notebook displays the 5 best hyperparameter sets.
tuner.get_best_hyperparameters(5)