In [1]:
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
def dropout_layer(input, p):
    """Apply inverted dropout to ``input``.

    Every element is zeroed independently with probability ``p``; the
    surviving elements are divided by ``1 - p`` so that the expected value
    of the output equals the input.

    Args:
        input: Tensor to apply dropout to.
        p: Drop probability; must satisfy ``0 <= p <= 1``.

    Returns:
        ``input`` itself when ``p == 0``, an all-zero tensor shaped like
        ``input`` when ``p == 1``, otherwise the masked and rescaled tensor.

    Raises:
        AssertionError: If ``p`` lies outside ``[0, 1]``.
    """
    assert 0 <= p <= 1
    if p == 0:
        # Nothing is dropped: hand the input back unchanged.
        return input
    if p == 1:
        # Everything is dropped; handled separately to avoid dividing by zero.
        return torch.zeros_like(input)
    # The mask must come from a *uniform* sample on [0, 1) so that
    # P(sample > p) == 1 - p. A standard-normal sample (torch.randn) would
    # keep elements with the wrong probability and break the rescaling.
    # The sample is created on the input's device so the product below works
    # for non-CPU tensors as well.
    mask = (torch.rand(input.shape, device=input.device) > p).float()
    return input * mask / (1.0 - p)

In [3]:
# 2x8 float tensor holding 0..15, used to exercise dropout_layer.
X = torch.arange(16, dtype=torch.float32).reshape(2, 8)
# Print X, then dropout_layer(X, p) for p = 0 (unchanged), p = 0.5 (random)
# and p = 1 (all zeros), one tensor after another.
print(X, *(dropout_layer(X, p) for p in (0., 0.5, 1.)), sep="\n")

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  2.,  0.,  0.,  0., 10.,  0., 14.],
        [ 0., 18., 20., 22.,  0., 26.,  0.,  0.]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])


## 简洁实现

In [5]:
# Dropout is a common way to control the complexity of a multilayer perceptron.
# It is generally applied only to fully connected layers; typical drop
# probabilities are 0.1, 0.5 and 0.9.
# A workable recipe: use more layers and a larger dropout probability.
# During backpropagation, the weights that were dropped in a step are not
# updated in that step.
# Dropout may slow convergence, because only part of the weights is updated
# on each backward pass.
# More open-ended uses: applying dropout to the data and the labels...
net = nn.Sequential(
    # Flatten each sample into a vector before the first linear layer.
    nn.Flatten(),
    # First hidden layer: 784 -> 256, ReLU, dropout with p = 0.5.
    nn.Linear(784, 256), nn.ReLU(), nn.Dropout(0.5),
    # Second hidden layer: 256 -> 256, ReLU, dropout with p = 0.5.
    nn.Linear(256, 256), nn.ReLU(), nn.Dropout(0.5),
    # Output layer: 256 -> 10.
    nn.Linear(256, 10),
)

def init_weights(layer):
    """Re-initialize the weight of an ``nn.Linear`` module in place.

    Modules whose type is exactly ``nn.Linear`` get their ``weight`` filled
    with samples from a normal distribution with standard deviation 0.01
    (``nn.init.normal_`` with its default mean). Every other module, and the
    bias of a linear module, is left untouched.

    Args:
        layer: The module to (possibly) re-initialize.
    """
    if type(layer) is not nn.Linear:
        return
    nn.init.normal_(layer.weight, std=0.01)

# Run init_weights over the modules of net. As the last expression in the
# cell, the value returned by apply() is displayed (the model summary below).
net.apply(init_weights)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.5, inplace=False)
  (7): Linear(in_features=256, out_features=10, bias=True)
)