### 层和块

- 顺序块的实现

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
class MySequential(nn.Module):
    """Minimal re-implementation of a sequential container.

    Child modules are stored in ``self._modules`` in the order they are given
    and are applied one after another in ``forward``.
    """

    def __init__(self, *args):
        """Register every positional argument as a child module.

        Args:
            *args: Modules to chain, in execution order.
        """
        super().__init__()
        for idx, block in enumerate(args):
            # Keys of ``_modules`` must be strings: they become the prefixes of
            # parameter names (e.g. '0.weight'). Using the module object itself
            # as the key breaks named_parameters() and state_dict().
            self._modules[str(idx)] = block

    def forward(self, X):
        """Feed ``X`` through each registered module in insertion order."""
        for block in self._modules.values():
            X = block(X)
        return X
# Chain two linear layers around a ReLU and run a random (2, 20) batch through them.
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.randn(2, 20)
net(X)

tensor([[-0.1842,  0.3863, -0.2438, -0.2552,  0.0299,  0.0508, -0.1792,  0.0092,
         -0.1979,  0.3025],
        [-0.2341, -0.1252, -0.0686,  0.1946, -0.1545, -0.2119,  0.0349,  0.2769,
          0.0106,  0.6139]], grad_fn=<AddmmBackward0>)

- 在前向传播函数中执行代码（固定的常量权重与 Python 控制流）

In [2]:
class FixedHiddenMLP(nn.Module):
    """MLP that mixes a trainable layer with a fixed random weight and control flow.

    The same ``linear`` layer is applied twice, with a constant random
    projection and a ReLU in between. The result is repeatedly halved until
    the sum of its absolute values is at most 1, and the scalar sum is returned.
    """

    def __init__(self):
        super().__init__()
        # Constant (never trained) weight. Registering it as a buffer makes it
        # follow the module through .to() / .double() / .cuda();
        # persistent=False keeps it out of state_dict so the saved keys are
        # exactly the same as with a plain tensor attribute.
        self.register_buffer('rand_weight', torch.rand((20, 20)), persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from a ``(batch, 20)`` input."""
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the same layer: both applications share one set of parameters.
        X = self.linear(X)
        # Data-dependent control flow: halve in place until the L1 norm is <= 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()
# Instantiate the block and run it on the (2, 20) batch X created in the first cell.
net = FixedHiddenMLP()
net(X)

tensor(0.1879, grad_fn=<SumBackward0>)

- 混合搭配各种组合块的方法

In [3]:
class NestMLP(nn.Module):
    """Block that nests an ``nn.Sequential`` stack and adds a linear layer on top."""

    def __init__(self):
        super().__init__()
        # 20 -> 64 -> 32 feature extractor, each linear layer followed by a ReLU.
        hidden_layers = [
            nn.Linear(20, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        self.net = nn.Sequential(*hidden_layers)
        # Final projection 32 -> 16.
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run ``X`` through the nested stack, then through the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Compose the custom blocks with a built-in layer: 20 -> 16 (NestMLP) -> 20 -> scalar (FixedHiddenMLP).
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(0.0470, grad_fn=<SumBackward0>)

### 参数管理
- Sequential 可以看作一个 Python 的 list，可以用下标（如 `net[2]`）访问其中的层
- torch 中名字以下划线 `_` 结尾的 API（如 `nn.init.normal_`）基本上表示原地（in-place）修改张量的值
- 模型中使用参数的.data访问参数数值

In [4]:
# MLP with one hidden layer of 8 units; X is rebound to a random (2, 4) batch.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.randn(2, 4)
net(X)

tensor([[-0.3124],
        [ 0.4515]], grad_fn=<AddmmBackward0>)

- 参数访问

In [5]:
# Parameters of the output layer (index 2) as an ordered name -> tensor mapping
print(net[2].state_dict())
# Type of the parameter object
print(type(net[2].bias))
# The bias parameter itself
print(net[2].bias)
# The underlying values of the bias
print(net[2].bias.data)

OrderedDict([('weight', tensor([[-0.1148,  0.3398, -0.3069, -0.1089, -0.3260,  0.3305, -0.1486, -0.2178]])), ('bias', tensor([0.2297]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2297], requires_grad=True)
tensor([0.2297])


In [6]:
# No backward pass has been run yet, so the gradient is still unset (None).
# Identity comparison is the correct way to test for None.
net[2].weight.grad is None

True

- 一次性访问所有参数

In [7]:
# Names and shapes of the first layer's parameters, then of every parameter in the network.
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [8]:
# Parameters can also be looked up by their qualified name in the state dict.
net.state_dict()['2.bias'].data

tensor([0.2297])

- 从嵌套块收集参数

In [9]:
def block1():
    """Build a 4 -> 8 -> 4 MLP in which every linear layer is followed by a ReLU."""
    return nn.Sequential(
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    )
def block2():
    """Stack four independent ``block1`` instances, named 'block0' .. 'block3'."""
    stacked = nn.Sequential()
    for i in range(4):
        # add_module registers the child under an explicit name instead of an index.
        child = block1()
        stacked.add_module(f'block{i}', child)
    return stacked
# Nest the stacked blocks inside another Sequential and append a final 4 -> 1 linear layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.0830],
        [0.0822]], grad_fn=<AddmmBackward0>)

In [10]:
# Printing a module shows its nested structure together with each child's name.
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [11]:
# Index through the nesting: outer Sequential -> second sub-block -> its first Linear layer.
print(rgnet[0][1][0].bias.data)
print(type(rgnet[0][1][0].bias))
print(rgnet[0][1][0].bias)

tensor([-0.1244, -0.1087,  0.4203, -0.4897,  0.3622,  0.2127, -0.1863, -0.2559])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.1244, -0.1087,  0.4203, -0.4897,  0.3622,  0.2127, -0.1863, -0.2559],
       requires_grad=True)


#### 参数初始化
- 使用内置初始化器

In [12]:
def init_normal(m):
    """Initialize a linear layer with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose exact type is not
    ``nn.Linear`` are left untouched.

    Args:
        m: The module currently visited by ``apply``.
    """
    if type(m) is nn.Linear:
        # The in-place initializers accept the parameter directly; going
        # through .data is unnecessary.
        nn.init.normal_(m.weight, std=0.01)
        # A layer built with bias=False has no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Run the initializer on every submodule, then inspect the first weight row and first bias entry.
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0111, -0.0061, -0.0123,  0.0035]), tensor(0.))

- 初始化为常数

In [13]:
def init_constant(m):
    """Initialize a linear layer with all-ones weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose exact type is not
    ``nn.Linear`` are left untouched.

    Args:
        m: The module currently visited by ``apply``.
    """
    if type(m) is nn.Linear:
        nn.init.constant_(m.weight, 1)
        # A layer built with bias=False has no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Run the initializer on every submodule, then display the full weight and bias of the first layer.
net.apply(init_constant)
net[0].weight.data, net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [14]:
def init_xavier(m):
    """Apply Xavier-uniform initialization to the weight of an ``nn.Linear`` layer.

    Any module whose exact type is not ``nn.Linear`` is skipped; the bias is
    not modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Fill the weight of an ``nn.Linear`` layer with the constant 42.

    Any module whose exact type is not ``nn.Linear`` is skipped; the bias is
    not modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Different initializers per layer: Xavier for the first layer, constant 42 for the output layer.
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data[0])

tensor([-0.6396,  0.6480,  0.0443,  0.7047])
tensor([42., 42., 42., 42., 42., 42., 42., 42.])


- 自定义初始化

In [15]:
def my_init(m):
    """Custom initializer for ``nn.Linear`` layers.

    Prints the name and shape of the layer's first parameter, draws the
    weight uniformly from [-10, 10], then zeroes every entry whose absolute
    value is below 5. Other module types are skipped.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print('Init', first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: multiplying by it keeps large entries and zeroes the rest.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask
# Apply the custom initializer to every submodule and show the first two weight rows of layer 0.
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 9.7359, -0.0000, -0.0000, -6.8115],
        [-0.0000, -6.7307,  0.0000, -0.0000]], grad_fn=<SliceBackward0>)

#### 参数绑定
- 有时需要在多个层间共享参数，使用参数绑定

In [16]:
# Create the layer once and insert the same object at two positions so they share parameters.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Positions 2 and 4 hold the same parameters.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0] = 100
# A write through one position is visible through the other: same object, not a copy.
print(net[2].weight.data[0] == net[4].weight.data[0])
net.state_dict()

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


OrderedDict([('0.weight',
              tensor([[ 0.4563,  0.0305,  0.2441, -0.4196],
                      [-0.0862,  0.3386,  0.3165, -0.0239],
                      [-0.0228, -0.0593, -0.4483, -0.2176],
                      [-0.4503,  0.0097, -0.3838, -0.2933],
                      [ 0.0222, -0.2507, -0.0928,  0.4842],
                      [ 0.2944, -0.3450, -0.2852,  0.2840],
                      [-0.0275,  0.0127,  0.0984, -0.4864],
                      [-0.4096,  0.1866,  0.3745,  0.0195]])),
             ('0.bias',
              tensor([ 0.3717, -0.0309,  0.0044, -0.0932, -0.4675, -0.4669, -0.1027,  0.2987])),
             ('2.weight',
              tensor([[ 1.0000e+02,  1.0000e+02,  1.0000e+02,  1.0000e+02,  1.0000e+02,
                        1.0000e+02,  1.0000e+02,  1.0000e+02],
                      [-1.1549e-01, -1.2120e-02,  1.2577e-01,  1.0400e-01, -6.2232e-02,
                       -7.6307e-02, -1.6271e-01,  1.9659e-02],
                      [-8.0348e-02, -9.617

### 自定义层

In [17]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the overall mean from its input."""

    def forward(self, X):
        """Return ``X`` shifted so that the mean over all of its elements is zero."""
        overall_mean = X.mean()
        return X - overall_mean
# The input has mean 3, so the result is centred around zero.
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))

tensor([-2., -1.,  0.,  1.,  2.])

In [18]:
# A custom layer can be used inside a Sequential like any built-in layer.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [19]:
# The centred output should have a mean of zero up to floating-point error.
Y = net(torch.randn(4, 8))
Y.mean()

tensor(-1.6764e-08, grad_fn=<MeanBackward0>)

- 实现带参数的自定义层

In [21]:
class Mylinear(nn.Module):
    """Fully connected layer with its own parameters and a built-in ReLU.

    Args:
        in_units: Number of input features.
        units: Number of output features.
    """

    def __init__(self, in_units, units):
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them with the module.
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the parameters themselves, not ``.data``: ``.data`` detaches the
        # computation from autograd, so no gradient would ever reach
        # ``weight`` or ``bias`` and the layer could not be trained.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)
# Instantiate with 5 input and 3 output features and inspect the weight parameter.
linear = Mylinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.2057, -0.5113, -0.5806],
        [ 0.2966, -0.9890,  1.0598],
        [-0.1841, -0.8341,  0.6299],
        [ 0.4241, -1.1316,  1.0118],
        [ 1.7035,  0.1831,  1.3404]], requires_grad=True)

In [22]:
# Forward pass on a random (2, 5) batch; the ReLU clamps negative entries to zero.
linear(torch.randn(2, 5))

tensor([[0.0000, 0.1081, 1.5533],
        [2.1240, 0.0000, 2.8896]])

In [23]:
# Custom parameterized layers compose inside a Sequential as well: 64 -> 8 -> 1.
net = nn.Sequential(Mylinear(64, 8), Mylinear(8, 1))
net(torch.randn(2, 64))

tensor([[6.5641],
        [3.3482]])

### 读写文件

In [25]:
# Small integer tensor used by the save/load examples below.
x = torch.arange(4)
x

tensor([0, 1, 2, 3])

In [27]:
# Write a single tensor to the file 'x-file' in the working directory and read it back.
torch.save(x, 'x-file')
x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [29]:
# A list of tensors can be saved and restored in one call.
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
x2, y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [30]:
# The same works for a dict that maps strings to tensors.
mydict = {'x':x, 'y':y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

- 加载和保存模型参数

In [31]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 20 -> 256 -> 10."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, X):
        """Apply the hidden layer, a ReLU, then the output layer."""
        activated = F.relu(self.hidden(X))
        return self.output(activated)

# Build the model whose parameters are saved in the following cells; X is rebound to a (2, 20) batch.
net = MLP()
X = torch.randn(2, 20)
net(X)


tensor([[-0.0287,  0.3186,  0.2208, -0.1492, -0.2727,  0.0072,  0.1323,  0.2184,
          0.4681,  0.1402],
        [ 0.0986,  0.5118,  0.3340,  0.2531, -0.0305,  0.3427,  0.3647,  0.6832,
          0.2802, -0.5067]], grad_fn=<AddmmBackward0>)

In [33]:
# Save only the parameters (the state dict), not the model object itself.
torch.save(net.state_dict(), 'mlp.params')

In [36]:
# Re-create the architecture in code, then load the saved parameters into it.
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
# eval() returns the module itself, which is why the cell displays the model.
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

- 直接保存网络结构和参数

In [37]:
# Save the whole module object (structure and parameters) and load it back.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
# NOTE(review): recent PyTorch releases appear to default torch.load to weights_only=True,
# which would reject a pickled nn.Module -- confirm against the installed version.
model = nn.Sequential(nn.Linear(20, 128), nn.ReLU(), nn.Linear(128, 20))
torch.save(model, 'model.pt')
model2 = torch.load('model.pt')
model2

Sequential(
  (0): Linear(in_features=20, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=20, bias=True)
)