# 参数管理

## example 1 hidden layer MLP(单隐藏层多层感知机)

In [1]:
import torch as tor
from torch import nn as tor_nn


In [2]:
# Single-hidden-layer MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
hidden_layer = tor_nn.Linear(4, 8)
output_layer = tor_nn.Linear(8, 1)
net = tor_nn.Sequential(hidden_layer, tor_nn.ReLU(), output_layer)

# Toy batch: 2 samples with 4 features each, drawn uniformly from [0, 1).
X = tor.rand(2, 4)
net(X)

tensor([[-0.0913],
        [-0.1703]], grad_fn=<AddmmBackward0>)

Sequential 可以简单地理解为 Python 中的列表(list)

因此我们可以通过索引去访问其中的每一层，就像我们使用索引去访问数组一样

In [3]:
# A Sequential can be indexed like a list; dump each layer's parameters in order.
for layer_idx in range(3):
    print(net[layer_idx].state_dict())

OrderedDict([('weight', tensor([[ 0.0188,  0.3250,  0.4889, -0.3333],
        [ 0.4175, -0.4646,  0.0190,  0.3603],
        [ 0.3477, -0.1676, -0.4880,  0.1333],
        [-0.2342,  0.0826,  0.0449, -0.3643],
        [ 0.3214,  0.1281, -0.4591,  0.4121],
        [ 0.1273, -0.3304,  0.4242,  0.0339],
        [ 0.3894,  0.0007, -0.5000,  0.0784],
        [-0.4300,  0.2715,  0.2679,  0.0498]])), ('bias', tensor([-0.1399, -0.3402, -0.4977, -0.2135,  0.4336,  0.3656,  0.2576,  0.2798]))])
OrderedDict()
OrderedDict([('weight', tensor([[ 0.3208, -0.1108,  0.2642,  0.0568, -0.3127, -0.2750,  0.3423, -0.0592]])), ('bias', tensor([0.2030]))])


我们可以看到我们的计数是从0开始的，然后到2，其中我们有:
- 第一层Linear(4, 8)
- 第二层ReLU(这玩意没什么参数)
- 第三层Linear(8, 1)

In [4]:
out_bias = net[2].bias
print(type(out_bias))  # the Parameter wrapper class
print(out_bias)        # full Parameter repr, including requires_grad
print(out_bias.data)   # only the underlying tensor values

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2030], requires_grad=True)
tensor([0.2030])


感觉就是一个C的链表

In [5]:
out_weight = net[2].weight
print(out_weight)
print(out_weight.grad)  # None here: no backward pass has been run yet

Parameter containing:
tensor([[ 0.3208, -0.1108,  0.2642,  0.0568, -0.3127, -0.2750,  0.3423, -0.0592]],
       requires_grad=True)
None


### 一次性访问所有参数

In [6]:
# Parameters of layer 0 only, as (name, shape) pairs.
print("单个第0层参数")
print(*((pname, p.shape) for pname, p in net[0].named_parameters()))
# Parameters of the whole network; each name is prefixed with its layer index.
print("多层所有参数")
print(*((pname, p.shape) for pname, p in net.named_parameters()))


单个第0层参数
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
多层所有参数
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


名字作为索引访问(感觉像Lua)

In [7]:
# state_dict() maps dotted names ("<layer index>.<parameter>") to tensors.
param_table = net.state_dict()
param_table['2.weight'].data

tensor([[ 0.3208, -0.1108,  0.2642,  0.0568, -0.3127, -0.2750,  0.3423, -0.0592]])

### 嵌套块收集参数

In [8]:
def block1():
    """Build one 4 -> 8 -> 4 MLP block with a ReLU after each linear layer.

    Returns:
        A new ``Sequential`` holding freshly constructed layers.
    """
    layers = [
        tor_nn.Linear(4, 8),
        tor_nn.ReLU(),
        tor_nn.Linear(8, 4),
        tor_nn.ReLU(),
    ]
    return tor_nn.Sequential(*layers)

def block2():
    """Stack four ``block1`` instances inside a single ``Sequential``.

    The children are registered as ``block0`` .. ``block3`` rather than under
    the default positional names.

    Returns:
        The ``Sequential`` containing the four named blocks.
    """
    stacked = tor_nn.Sequential()  # start empty; children are registered below
    for idx in range(4):
        # add_module lets us choose the child's name instead of getting 0..n.
        child_name = f'block{idx}'
        stacked.add_module(child_name, block1())
    return stacked

In [9]:
# Nest the stacked blocks and finish with a 4 -> 1 linear head.
body, head = block2(), tor_nn.Linear(4, 1)
rgnet = tor_nn.Sequential(body, head)
rgnet(X)

tensor([[0.1057],
        [0.1056]], grad_fn=<AddmmBackward0>)

展示这个网络的完整结构

In [10]:
# Printing a module shows its nested structure; each child is labelled with
# the name it was registered under (an index, or the name given to add_module).
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## 参数初始化

### 内置初始化

In [17]:
def init_normal(m):
    """Initialise a ``Linear`` layer in place: weights ~ N(0, 0.01^2), bias = 0.

    Intended to be passed to ``Module.apply``. Any module whose type is not
    exactly ``Linear`` is left untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    if type(m) is not tor_nn.Linear:
        return
    # The trailing underscore is PyTorch's convention for in-place operations.
    # ``std`` is the standard deviation (0.01), not the variance.
    tor_nn.init.normal_(m.weight, mean=0, std=0.01)
    # Linear(..., bias=False) stores its bias as None, which zeros_ cannot take.
    if m.bias is not None:
        tor_nn.init.zeros_(m.bias)
        
# Run the initialiser over the network, then inspect the first hidden unit.
net.apply(init_normal)
first_layer = net[0]
first_layer.weight.data[0], first_layer.bias.data[0]

(tensor([-0.0028,  0.0044,  0.0108,  0.0009]), tensor(0.))

In [13]:
def init_constant(m):
    """Initialise a ``Linear`` layer in place: every weight = 1, bias = 0.

    Intended to be passed to ``Module.apply``. Any module whose type is not
    exactly ``Linear`` is left untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    if type(m) is not tor_nn.Linear:
        return
    tor_nn.init.constant_(m.weight, 1)  # fill the weights with a fixed constant
    # Linear(..., bias=False) stores its bias as None, which zeros_ cannot take.
    if m.bias is not None:
        tor_nn.init.zeros_(m.bias)
        
# Run the constant initialiser over the network, then inspect the first hidden unit.
net.apply(init_constant)
first_layer = net[0]
first_layer.weight.data[0], first_layer.bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

对不同的层应用不同的初始化方法

In [18]:
def xavier(m):
    """Apply Xavier-uniform initialisation to the weight of a ``Linear`` layer.

    Modules of any other type are ignored, and the bias is not modified.

    Args:
        m: The module to (possibly) initialise in place.
    """
    if type(m) is not tor_nn.Linear:
        return
    tor_nn.init.xavier_uniform_(m.weight)
        
def init_42(m):
    """Fill the weight of a ``Linear`` layer with the constant 42.

    Modules of any other type are ignored, and the bias is not modified.

    Args:
        m: The module to (possibly) initialise in place.
    """
    if type(m) is not tor_nn.Linear:
        return
    tor_nn.init.constant_(m.weight, 42)
        
# Different initialisers for different layers: Xavier for the hidden layer,
# the constant 42 for the output layer.
hidden_layer, output_layer = net[0], net[2]
hidden_layer.apply(xavier)
output_layer.apply(init_42)
print(hidden_layer.weight.data[0])
print(output_layer.weight.data)

tensor([-0.5562, -0.3808,  0.1112, -0.0838])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


注意这里，apply(fn) 并不是把什么东西“加入”模型，而是把函数 fn 递归地作用到模块自身以及它的每一个子模块上，并且在调用的那一刻就立即执行。因此既可以对整个网络调用（net.apply），也可以只对某一层调用（net[0].apply），从而给不同的层使用不同的初始化方法。（与其说像往三明治里面加肉加菜，不如说是给三明治的每一层都处理一遍。）

In [25]:
def my_init(m):
    """Custom initialiser for ``Linear`` layers, for use with ``Module.apply``.

    Prints the name and shape of the layer's first parameter, draws the
    weights uniformly from [-10, 10], then zeroes every weight whose absolute
    value is below 5. Modules of any other type are ignored.

    Args:
        m: The module to (possibly) initialise in place.
    """
    if type(m) is not tor_nn.Linear:
        return
    # Index 0 of the named parameters is the weight; index 1 would be the bias.
    first_name, first_param = list(m.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    tor_nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: multiplying by it keeps entries with |w| >= 5 and zeroes the rest.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask
        
# Run the custom initialiser over the network, then show the first two rows
# of the hidden layer's weight matrix.
net.apply(my_init)
hidden_weight = net[0].weight
hidden_weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 0.0000, -7.6605,  0.0000, -0.0000],
        [-0.0000, -8.1597, -0.0000,  9.8050]], grad_fn=<SliceBackward0>)

还有一种就是直接做操作

In [26]:
# Parameters can also be edited directly through the tensor behind ``.data``.
w0 = net[0].weight.data
w0 += 1        # in place: shift every entry by 1
w0[0, 0] = 42  # overwrite a single entry
w0, w0[0]

(tensor([[42.0000, -6.6605,  1.0000,  1.0000],
         [ 1.0000, -7.1597,  1.0000, 10.8050],
         [10.1559,  1.0000,  1.0000,  1.0000],
         [-8.8623,  1.0000,  1.0000,  7.7619],
         [-7.2341,  1.0000,  1.0000,  9.6321],
         [ 1.0000,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  9.7660,  8.1742],
         [ 6.3477,  1.0000, -8.2250,  1.0000]]),
 tensor([42.0000, -6.6605,  1.0000,  1.0000]))

### 参数绑定(共享权重)

其实就是让多个层共用同一组参数：它们不只是数值相等，而是同一个对象，所以修改其中一个，另一个也会随之改变

In [27]:
# One layer object placed at two positions: both positions use the same parameters.
shared = tor_nn.Linear(8, 8)
net = tor_nn.Sequential(
    tor_nn.Linear(4, 8), tor_nn.ReLU(),  # 0, 1
    shared, tor_nn.ReLU(),               # 2, 3
    shared, tor_nn.ReLU(),               # 4, 5
    tor_nn.Linear(8, 1),                 # 6
)

net(X)
first_use, second_use = net[2], net[4]
print(first_use.weight.data[0] == second_use.weight.data[0])
# A write made through position 2 is also visible through position 4.
first_use.weight.data[0, 0] = 100
print(first_use.weight.data[0] == second_use.weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
