1. 访问参数，用于调试，诊断和可视化
2. 参数初始化
3. 在不同模型组件间共享参数

In [1]:
import torch
from torch import nn

# Example network: Linear(4 -> 8), ReLU, Linear(8 -> 1), chained in nn.Sequential.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8,1))
# Random input of shape (2, 4); the last dimension matches the first layer's 4 inputs.
X = torch.rand(size=(2, 4))
# Forward pass; as the last expression of the cell, its (2, 1) result is echoed.
net(X)

tensor([[-0.2990],
        [-0.2700]], grad_fn=<AddmmBackward0>)

In [2]:
# net[2] is the final nn.Linear(8, 1); its state_dict holds its 'weight' and 'bias' tensors.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.1173,  0.3169, -0.2477, -0.2517,  0.0818, -0.1098, -0.2563,  0.1627]])), ('bias', tensor([-0.0655]))])


In [4]:
# Inspect the bias of the last layer: its type, the Parameter itself,
# the underlying tensor value, and its gradient.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net[2].bias.grad) # No backward pass has been run yet, so this is None; after one it holds the current gradient

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0655], requires_grad=True)
tensor([-0.0655])
None


In [8]:
# Print every (name, parameter) pair of the network; names look like '0.weight', '2.bias'.
print(*[(name, param) for name, param in net.named_parameters()])

('0.weight', Parameter containing:
tensor([[ 0.3520, -0.1386,  0.0335, -0.4187],
        [ 0.2503, -0.4411, -0.1166, -0.1964],
        [-0.1074,  0.1204,  0.0162,  0.2591],
        [ 0.3747,  0.2571,  0.4487,  0.3704],
        [-0.2088,  0.3941, -0.0718, -0.1930],
        [ 0.1539, -0.3835, -0.4139, -0.2364],
        [ 0.3735, -0.0768, -0.0442,  0.0525],
        [ 0.1062, -0.4544, -0.4975, -0.3814]], requires_grad=True)) ('0.bias', Parameter containing:
tensor([-0.3084,  0.1636, -0.2396,  0.0022,  0.3327, -0.1113,  0.4797,  0.4141],
       requires_grad=True)) ('2.weight', Parameter containing:
tensor([[-0.1173,  0.3169, -0.2477, -0.2517,  0.0818, -0.1098, -0.2563,  0.1627]],
       requires_grad=True)) ('2.bias', Parameter containing:
tensor([-0.0655], requires_grad=True))


In [10]:
# Same listing as before, but showing only each parameter's shape.
print(*[(name, param.shape) for name, param in net.named_parameters()])

# Index 1 is the ReLU; it has no parameters, so nothing is listed for it

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [11]:
# A parameter can also be fetched directly by its name from the state_dict
net.state_dict()['2.weight'].data

tensor([[-0.1173,  0.3169, -0.2477, -0.2517,  0.0818, -0.1098, -0.2563,  0.1627]])

从嵌套块里收集参数

In [13]:
def block1():
    """Build one basic block: Linear(4 -> 8), ReLU, Linear(8 -> 4), ReLU.

    Every call constructs new layers, so the returned ``nn.Sequential``
    has its own parameters.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four ``block1()`` sub-networks inside one ``nn.Sequential``.

    The sub-networks are registered under the names 'block 0' .. 'block 3'.
    ``block1()`` is called once per name, so each sub-network is a separate
    instance.
    """
    stacked = nn.Sequential()
    child_names = [f'block {i}' for i in range(4)]
    for child_name in child_names:
        # add_module registers the child under an explicit name instead of
        # the automatic '0', '1', ... indices.
        stacked.add_module(child_name, block1())
    return stacked

# Nest the four-block stack from block2() inside another Sequential,
# followed by a final Linear(4 -> 1) layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
# Forward pass on X; the result is echoed as the cell output.
rgnet(X)

tensor([[-0.1489],
        [-0.1489]], grad_fn=<AddmmBackward0>)

In [14]:
# Printing the module shows the nested layout of containers and layers.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [15]:
# Nested containers can be indexed level by level: child 0 of rgnet (the block2 stack),
# then its second sub-block ('block 1'), then that sub-block's layer 2, Linear(8 -> 4).
rgnet[0][1][2].weight.data

tensor([[-0.3028,  0.2185, -0.1444, -0.3343,  0.1974, -0.1635,  0.1679, -0.1406],
        [-0.1340, -0.1988, -0.2068, -0.1209,  0.2438, -0.0779,  0.2702,  0.0333],
        [ 0.3199, -0.1364,  0.1918, -0.2341,  0.0706,  0.2951, -0.1298,  0.3378],
        [-0.3073,  0.0435, -0.1892, -0.2856,  0.1665,  0.0789,  0.3357,  0.0075]])

参数初始化

1. 内置初始化
2. 自定义初始化