In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Randomly zero elements of ``X`` and rescale the survivors.

    Each element is kept when a uniform sample drawn from (0, 1) falls below
    ``1 - drop_prob``; kept elements are divided by ``1 - drop_prob``.

    Args:
        X: Input array. It must provide ``shape`` and ``zeros_like()``.
        drop_prob: Probability of dropping an element; must lie in [0, 1].

    Returns:
        An array with the same shape as ``X``. When ``drop_prob`` is 1 the
        result is all zeros.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1
    retain_prob = 1 - drop_prob

    if retain_prob != 0:
        # Comparison yields a 0/1 mask; dividing by retain_prob rescales the
        # elements that survive.
        keep_mask = nd.random.uniform(0, 1, X.shape) < retain_prob
        return keep_mask * X / retain_prob

    # Nothing is retained: skip the division by zero and return zeros.
    return X.zeros_like()

In [2]:
# Small 2x8 test matrix holding the values 0..15.
X = nd.arange(16).reshape((2, 8))
# With drop_prob=0 the retain probability is 1; the displayed result equals X.
dropout(X, drop_prob=0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# drop_prob=0.5: surviving elements are divided by 0.5, i.e. doubled.
dropout(X, drop_prob=0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob=1 takes the early-return branch and yields an all-zero array.
dropout(X, drop_prob=1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [6]:
# Layer sizes of the from-scratch network.
num_inputs = 784
num_hiddens1 = num_hiddens2 = 256
num_outputs = 10

# Weights are sampled from a normal distribution with standard deviation 0.01,
# in the order W1, W2, W3; biases start at zero.
W1, W2, W3 = (
    nd.random.normal(scale=0.01, shape=layer_shape)
    for layer_shape in (
        (num_inputs, num_hiddens1),
        (num_hiddens1, num_hiddens2),
        (num_hiddens2, num_outputs),
    )
)
b1, b2, b3 = (
    nd.zeros(width) for width in (num_hiddens1, num_hiddens2, num_outputs)
)

# Allocate gradient buffers for every trainable array.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

In [7]:
# Drop probabilities for the first and second hidden layer, respectively.
drop_prob1 = 0.2
drop_prob2 = 0.5


def net(X):
    """Forward pass of the from-scratch network with two hidden layers.

    The input is flattened to ``(-1, num_inputs)``. Each hidden layer is an
    affine map followed by ReLU; dropout is applied to its output only while
    ``autograd.is_training()`` reports True. The output layer is a plain
    affine map.

    Args:
        X: Input batch; reshaped to ``(-1, num_inputs)`` before use.

    Returns:
        The result of the final affine map ``nd.dot(H2, W3) + b3``.
    """
    hidden = X.reshape((-1, num_inputs))
    hidden_layers = ((W1, b1, drop_prob1), (W2, b2, drop_prob2))
    for weight, bias, layer_drop_prob in hidden_layers:
        hidden = (nd.dot(hidden, weight) + bias).relu()
        # Dropout is a training-time regulariser; skip it otherwise.
        if autograd.is_training():
            hidden = dropout(hidden, layer_drop_prob)
    return nd.dot(hidden, W3) + b3

In [8]:
# Training hyperparameters shared by the runs that follow.
num_epochs = 5
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model, passing the raw parameter list and the
# learning rate directly.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr
)

epoch 1, loss 1.1059, train acc 0.571, test acc 0.782
epoch 2, loss 0.5767, train acc 0.785, test acc 0.839
epoch 3, loss 0.4887, train acc 0.822, test acc 0.851
epoch 4, loss 0.4511, train acc 0.835, test acc 0.857
epoch 5, loss 0.4180, train acc 0.848, test acc 0.863


In [15]:
# Gluon version of the same architecture: Dense(256) -> Dropout ->
# Dense(256) -> Dropout -> Dense(10).
# NOTE(review): this rebinds `net`, which previously referred to the
# from-scratch forward function; re-running the earlier training cell after
# this one would train this model instead.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dropout(drop_prob1))
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dropout(drop_prob2))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [12]:
# Train the Gluon model with SGD; the Trainer takes over the parameter
# updates, so the `params` and `lr` slots of train_ch3 are passed as None.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 1.1925, train acc 0.538, test acc 0.771
epoch 2, loss 0.5894, train acc 0.782, test acc 0.832
epoch 3, loss 0.4972, train acc 0.819, test acc 0.833
epoch 4, loss 0.4553, train acc 0.835, test acc 0.860
epoch 5, loss 0.4196, train acc 0.848, test acc 0.864


In [14]:
# Repeat of the Gluon training run with a fresh Trainer.
# NOTE(review): this cell duplicates the previous training cell, and the
# execution counts are out of order; the loss restarting near 1.16 suggests
# the model was re-initialised in between. Confirm the intended cell order.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 1.1593, train acc 0.550, test acc 0.785
epoch 2, loss 0.6027, train acc 0.772, test acc 0.832
epoch 3, loss 0.5193, train acc 0.808, test acc 0.841
epoch 4, loss 0.4742, train acc 0.825, test acc 0.842
epoch 5, loss 0.4505, train acc 0.834, test acc 0.861


In [16]:
# Longer run of the dropout model: 20 epochs instead of `num_epochs`.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, 20, batch_size,
    None, None, trainer,
)

epoch 1, loss 1.1088, train acc 0.569, test acc 0.765
epoch 2, loss 0.5707, train acc 0.786, test acc 0.846
epoch 3, loss 0.4866, train acc 0.821, test acc 0.846
epoch 4, loss 0.4407, train acc 0.839, test acc 0.865
epoch 5, loss 0.4252, train acc 0.846, test acc 0.866
epoch 6, loss 0.3938, train acc 0.856, test acc 0.871
epoch 7, loss 0.3739, train acc 0.864, test acc 0.872
epoch 8, loss 0.3651, train acc 0.867, test acc 0.869
epoch 9, loss 0.3502, train acc 0.872, test acc 0.878
epoch 10, loss 0.3429, train acc 0.875, test acc 0.883
epoch 11, loss 0.3333, train acc 0.877, test acc 0.881
epoch 12, loss 0.3280, train acc 0.880, test acc 0.882
epoch 13, loss 0.3199, train acc 0.883, test acc 0.884
epoch 14, loss 0.3137, train acc 0.884, test acc 0.884
epoch 15, loss 0.3061, train acc 0.886, test acc 0.892
epoch 16, loss 0.2970, train acc 0.890, test acc 0.890
epoch 17, loss 0.2973, train acc 0.889, test acc 0.886
epoch 18, loss 0.2968, train acc 0.890, test acc 0.883
epoch 19, loss 0.28

In [17]:
# Baseline for comparison: the same Dense(256) -> Dense(256) -> Dense(10)
# stack as `net`, but with both Dropout layers left out.
net2 = nn.Sequential()
net2.add(nn.Dense(256, activation='relu'))
net2.add(nn.Dense(256, activation='relu'))
net2.add(nn.Dense(10))
net2.initialize(init.Normal(sigma=0.01))

In [18]:
# Train the no-dropout baseline for 20 epochs with the same SGD settings.
trainer = gluon.Trainer(
    net2.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net2, train_iter, test_iter, loss, 20, batch_size,
    None, None, trainer,
)

epoch 1, loss 1.1491, train acc 0.557, test acc 0.780
epoch 2, loss 0.5418, train acc 0.793, test acc 0.830
epoch 3, loss 1.3126, train acc 0.612, test acc 0.745
epoch 4, loss 0.5870, train acc 0.775, test acc 0.823
epoch 5, loss 0.4746, train acc 0.824, test acc 0.844
epoch 6, loss 0.4312, train acc 0.840, test acc 0.857
epoch 7, loss 0.4073, train acc 0.849, test acc 0.856
epoch 8, loss 0.3919, train acc 0.854, test acc 0.862
epoch 9, loss 0.3913, train acc 0.856, test acc 0.866
epoch 10, loss 0.3682, train acc 0.863, test acc 0.862
epoch 11, loss 0.3622, train acc 0.865, test acc 0.874
epoch 12, loss 0.3554, train acc 0.868, test acc 0.866
epoch 13, loss 0.3422, train acc 0.872, test acc 0.863
epoch 14, loss 0.3379, train acc 0.875, test acc 0.872
epoch 15, loss 0.3305, train acc 0.877, test acc 0.871
epoch 16, loss 0.3239, train acc 0.879, test acc 0.879
epoch 17, loss 0.3152, train acc 0.882, test acc 0.876
epoch 18, loss 0.3108, train acc 0.885, test acc 0.871
epoch 19, loss 0.30