# Dropout


In [1]:
import sys
# Add the directory two levels up to the module search path; presumably this
# is where the local `d2l` package imported below lives (confirm against the
# repository layout).
sys.path.append('../..')

In [2]:
import d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

  from ._conv import register_converters as _register_converters


## Dropout from Scratch

In [3]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element of ``X`` is zeroed independently with probability
    ``drop_prob``; the surviving elements are divided by ``1 - drop_prob`` so
    that the expected value of each element is unchanged.

    Args:
        X: Input array (an MXNet NDArray in this notebook).
        drop_prob: Probability of dropping an element, in ``[0, 1]``.

    Returns:
        An array with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    assert 0 <= drop_prob <= 1
    # Everything is dropped: return zeros directly, because the rescaling
    # below would otherwise divide by zero.
    if drop_prob == 1:
        return X.zeros_like()
    # Sample the keep-mask on the same device as X so the element-wise product
    # below never mixes arrays that live on different contexts.
    mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) > drop_prob
    return mask * X / (1.0 - drop_prob)

## Sanity Test 

In [4]:
# Run dropout on a small 2x8 array with three drop probabilities:
# 0 (nothing dropped), 0.5 (random half dropped) and 1 (all dropped).
X = nd.arange(16).reshape((2, 8))
for prob in (0, 0.5, 1):
    print(dropout(X, prob))


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  0.  0.  8. 10. 12.  0.]
 [16.  0. 20. 22.  0.  0.  0. 30.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>


### Defining Model Parameters


In [5]:
# Sizes of the input layer, the output layer and the two hidden layers.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 1024, 2048

# (fan-in, fan-out) of the three fully connected layers, in forward order.
layer_dims = [
    (num_inputs, num_hiddens1),
    (num_hiddens1, num_hiddens2),
    (num_hiddens2, num_outputs),
]

# For each layer: a weight matrix drawn from N(0, 0.01^2) and a zero bias.
params = []
for fan_in, fan_out in layer_dims:
    params.append(nd.random.normal(scale=0.01, shape=(fan_in, fan_out)))
    params.append(nd.zeros(fan_out))
W1, b1, W2, b2, W3, b3 = params

# Allocate gradient buffers so autograd can record gradients for every
# parameter.
for tensor in params:
    tensor.attach_grad()

### Defining the Model

In [6]:
# Drop probabilities for the dropout layers after the first and second hidden
# layers. Both are 0.0 here, so no activations are dropped.
drop_prob1 = 0.0
drop_prob2 = 0.0

def net(X):
    """Forward pass of the three-layer perceptron with dropout.

    The input is flattened to rows of ``num_inputs`` features, passed through
    two ReLU hidden layers and a linear output layer. Dropout is applied after
    each hidden layer, but only while autograd is in training mode.
    """
    training = autograd.is_training()  # Dropout is used only when training.
    flat = X.reshape((-1, num_inputs))

    # First fully connected layer, followed by dropout.
    hidden1 = (nd.dot(flat, W1) + b1).relu()
    if training:
        hidden1 = dropout(hidden1, drop_prob1)

    # Second fully connected layer, followed by dropout.
    hidden2 = (nd.dot(hidden1, W2) + b2).relu()
    if training:
        hidden2 = dropout(hidden2, drop_prob2)

    # Output layer: no activation and no dropout.
    return nd.dot(hidden2, W3) + b3

### Training and Testing

In [7]:
# Hyperparameters for training.
num_epochs = 10
lr = 0.5
batch_size = 256

# Softmax cross-entropy loss and the Fashion-MNIST data iterators.
loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model, passing its parameters and learning rate.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params, lr)

epoch 1, loss 0.9428, train acc 0.650, test acc 0.772
epoch 2, loss 0.5514, train acc 0.798, test acc 0.843
epoch 3, loss 0.4427, train acc 0.837, test acc 0.864
epoch 4, loss 0.3942, train acc 0.853, test acc 0.863
epoch 5, loss 0.3679, train acc 0.863, test acc 0.877
epoch 6, loss 0.3464, train acc 0.872, test acc 0.878
epoch 7, loss 0.3286, train acc 0.879, test acc 0.875
epoch 8, loss 0.3146, train acc 0.882, test acc 0.882
epoch 9, loss 0.3008, train acc 0.888, test acc 0.884
epoch 10, loss 0.2933, train acc 0.890, test acc 0.885


## Dropout in Gluon

In [7]:
# The same architecture built from Gluon layers, added one at a time.
net = nn.Sequential()
net.add(nn.Dense(num_hiddens1, activation="relu"))
# Dropout layer after the first fully connected layer.
net.add(nn.Dropout(drop_prob1))
net.add(nn.Dense(num_hiddens2, activation="relu"))
# Dropout layer after the second fully connected layer.
net.add(nn.Dropout(drop_prob2))
net.add(nn.Dense(num_outputs))
# Initialise the parameters from a normal distribution with sigma 0.01.
net.initialize(init.Normal(sigma=0.01))

### Training

In [8]:
# Plain SGD over all parameters of the Gluon model, reusing the learning rate
# from the from-scratch run.
sgd_options = {'learning_rate': lr}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_options)

# The params and lr arguments are None because the trainer performs the
# updates.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              None, None, trainer)

epoch 1, loss 0.9580, train acc 0.644, test acc 0.808
epoch 2, loss 0.5070, train acc 0.810, test acc 0.852
epoch 3, loss 0.4382, train acc 0.837, test acc 0.854
epoch 4, loss 0.3925, train acc 0.855, test acc 0.861
epoch 5, loss 0.3682, train acc 0.864, test acc 0.875
epoch 6, loss 0.3460, train acc 0.871, test acc 0.866
epoch 7, loss 0.3437, train acc 0.873, test acc 0.880
epoch 8, loss 0.3158, train acc 0.883, test acc 0.882
epoch 9, loss 0.3032, train acc 0.888, test acc 0.878
epoch 10, loss 0.2927, train acc 0.890, test acc 0.888
