# Understanding the difficulty of training deep feedforward neural networks
by Xavier Glorot and Yoshua Bengio (2010)

## Imports

In [2]:
%load_ext autoreload
%autoreload 2
from itertools import chain
from pathlib import Path

### Deep Learner

In [None]:
from deep_learner.datasets import cifar10, mnist
from deep_learner.nn import CrossEntropyLoss, Linear, Sequential, Sigmoid, Softmax


### Datasets

In [7]:
# Load both datasets once, up front, via the deep_learner.datasets loaders
# imported earlier in this notebook.
# NOTE(review): what cifar10() / mnist() return (splits, array layout) is not
# visible here — check deep_learner.datasets before indexing into these.
CIFAR10_DATASET = cifar10()
MNIST_DATASET = mnist()


## Models

In [30]:
# Registry of sigmoid MLPs keyed "model_sig_<k>", where k (1..5) is the number
# of hidden layers in the network.
models = {}

for num_hidden in range(1, 6):
    # First hidden layer: 128 -> 1000 units with a sigmoid activation.
    # NOTE(review): an input width of 128 looks small for flattened MNIST or
    # CIFAR-10 images — confirm against what the dataset loaders yield.
    sigmoid_stack = [Linear(128, 1_000), Sigmoid()]
    # Every additional hidden layer is one (Linear, Sigmoid) pair; using a
    # comprehension keeps its loop variables out of the notebook namespace.
    sigmoid_stack += [
        layer
        for _ in range(num_hidden - 1)
        for layer in (Linear(1_000, 1_000), Sigmoid())
    ]
    # 10-unit output layer followed by Softmax.
    sigmoid_stack += [Linear(1_000, 10), Softmax()]
    models[f"model_sig_{num_hidden}"] = Sequential(*sigmoid_stack)

print(models)

{'model_sig_1': Sequential(
    (0): Linear(n_in=128, n_out=1000)
    (1): Sigmoid()
    (2): Linear(n_in=1000, n_out=10)
    (3): Softmax()
), 'model_sig_2': Sequential(
    (0): Linear(n_in=128, n_out=1000)
    (1): Sigmoid()
    (2): Linear(n_in=1000, n_out=1000)
    (3): Sigmoid()
    (4): Linear(n_in=1000, n_out=10)
    (5): Softmax()
), 'model_sig_3': Sequential(
    (0): Linear(n_in=128, n_out=1000)
    (1): Sigmoid()
    (2): Linear(n_in=1000, n_out=1000)
    (3): Sigmoid()
    (4): Linear(n_in=1000, n_out=1000)
    (5): Sigmoid()
    (6): Linear(n_in=1000, n_out=10)
    (7): Softmax()
), 'model_sig_4': Sequential(
    (0): Linear(n_in=128, n_out=1000)
    (1): Sigmoid()
    (2): Linear(n_in=1000, n_out=1000)
    (3): Sigmoid()
    (4): Linear(n_in=1000, n_out=1000)
    (5): Sigmoid()
    (6): Linear(n_in=1000, n_out=1000)
    (7): Sigmoid()
    (8): Linear(n_in=1000, n_out=10)
    (9): Softmax()
), 'model_sig_5': Sequential(
    (0): Linear(n_in=128, n_out=1000)
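    (1): Sigmoid()
    (2): Linear(n_in=1000, n_out=1000)
    (3): Sigmoid()
    (4): Linear(n_in=1000, n_out=1000)
    (5): Sigmoid()
    (6): Linear(n_in=1000, n_out=1000)
    (7): Sigmoid()
    (8): Linear(n_in=1000, n_out=1000)
    (9): Sigmoid()
    (10): Linear(n_in=1000, n_out=10)
    (11): Softmax()
)}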

## Hyperparameters

In [None]:
# The paper does not spell out how the learning rate was tuned, although it
# appears to have been some kind of search scored on a validation set;
# for now we therefore simply fix it at 10^-3.
LEARNING_RATE: float = 1e-3
# NOTE(review): batch size and epoch count are fixed here with no stated
# rationale, and their use is outside this excerpt — confirm against the paper.
MINI_BATCH_SIZE: int = 10
EPOCHS: int = 100