# 1. 模型参数的访问

In [43]:
import torch
from torch import nn
from torch.nn import init

In [44]:
# Two-layer perceptron: 4 inputs -> 3 hidden units (ReLU) -> 1 output.
hidden_layer = nn.Linear(4, 3)
output_layer = nn.Linear(3, 1)
net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)
# Printing a Sequential lists its sub-modules keyed by their index.
print(net)

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


In [45]:
# Random batch of 2 samples with 4 features each.
X = torch.rand((2, 4))
# Reduce the network output to a scalar so backward() can be called on it later.
Y = torch.sum(net(X))

对于Sequential实例中含有模型参数的层，可以通过parameters()或named_parameters()方法访问其所有参数（两者都以迭代器的形式返回），后者除了返回参数本身外还会返回参数的名字。

In [46]:
# Both accessors return generators, not lists.
for accessor in (net.named_parameters, net.parameters):
    print(type(accessor()))

# named_parameters() yields (name, parameter) pairs; the name is prefixed
# with the index of the layer inside the Sequential container.
for param_name, tensor in net.named_parameters():
    print(param_name, tensor.size())
# parameters() yields the same parameters without their names.
for tensor in net.parameters():
    print(tensor.size())

<class 'generator'>
<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])
torch.Size([3, 4])
torch.Size([3])
torch.Size([1, 3])
torch.Size([1])


In [47]:
# Access the parameters of a single layer of net: indexing the Sequential
# returns the layer, whose parameter names carry no index prefix.
first_layer = net[0]
for param_name, tensor in first_layer.named_parameters():
    print(param_name, tensor.size(), type(tensor))

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


In [48]:
# Parameter is a Tensor subclass; assigning one as a module attribute
# automatically adds it to the module's parameter list.
class MyModule(nn.Module):
    """Toy module contrasting a registered Parameter with a plain tensor."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Wrapped in nn.Parameter: shows up in named_parameters().
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor attribute: NOT registered as a parameter.
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass; performs no computation."""
        return None

# Only attributes wrapped in nn.Parameter are listed here.
n = MyModule()
for param_name, tensor in n.named_parameters():
    print(param_name, tensor.size())

weight1 torch.Size([20, 20])


In [49]:
# As with any tensor, a parameter's values are read through .data and its
# gradient through .grad.
weight_0 = next(net[0].parameters())  # first parameter of the first layer
print(weight_0.data)
# No backward pass has run yet at this point.
print(weight_0.grad)
Y.backward()
# After backpropagating from Y the gradient has been populated.
print(weight_0.grad)

tensor([[ 0.1128, -0.0749, -0.2854, -0.0079],
        [-0.0461, -0.2120,  0.4907, -0.0095],
        [-0.0410, -0.4588,  0.3709, -0.3550]])
None
tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.1603, 0.0415, 0.1438, 0.1853],
        [0.6148, 0.3353, 0.5958, 0.6931]])


# 2. 模型参数的初始化

In [50]:
# Re-initialize every weight from a normal distribution with mean 0 and
# standard deviation 0.01, printing the values before and after.
for name, param in net.named_parameters():
    if "weight" not in name:
        continue  # biases are left untouched here
    print("before")
    print(name, param.data)
    init.normal_(param, mean=0, std=0.01)
    print("after")
    print(name, param.data)

before
0.weight tensor([[ 0.1128, -0.0749, -0.2854, -0.0079],
        [-0.0461, -0.2120,  0.4907, -0.0095],
        [-0.0410, -0.4588,  0.3709, -0.3550]])
after
0.weight tensor([[-1.6624e-02, -1.5824e-02, -6.0067e-03,  5.9786e-05],
        [-9.5843e-03,  5.5858e-03, -2.0283e-02,  9.1880e-03],
        [-1.0010e-02, -6.1504e-03, -6.5615e-04,  2.5002e-03]])
before
2.weight tensor([[0.0259, 0.1901, 0.5680]])
after
2.weight tensor([[ 0.0150, -0.0055,  0.0026]])


In [54]:
# Constant initialization: set every bias to 1, printing the values before
# and after.
for name, param in (
    (n_, p_) for n_, p_ in net.named_parameters() if "bias" in n_
):
    print("before")
    print(name, param.data)
    init.constant_(param, val=1)
    print("after")
    print(name, param.data)

before
0.bias tensor([0.0000, 0.0000, 7.3857])
after
0.bias tensor([1., 1., 1.])
before
2.bias tensor([-0.])
after
2.bias tensor([1.])


## 自定义初始化方法
我们也可以自定义初始化方法。在此之前，先参考一下torch.nn.init.normal_的实现：

In [55]:
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with samples from a normal distribution.

    Args:
        tensor: Tensor (or Parameter) to overwrite.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        The same ``tensor`` object, after being modified in place.
    """
    # Run the in-place fill with gradient tracking disabled.
    with torch.no_grad():
        filled = tensor.normal_(mean, std)
    return filled

可以看到，它其实就是一个原地改变tensor的值、并且不记录梯度的函数（Parameter本身就是Tensor的子类，所以同样适用）。

In [56]:
# Custom initializer.
# Each element is drawn uniformly from [-10, 10) and then zeroed when its
# magnitude is below 5: about half the elements become 0, the rest lie in
# [-10, -5] or [5, 10].
def init_weight_(tensor):
    """Initialize ``tensor`` in place without recording gradients.

    Args:
        tensor: Tensor (or Parameter) to overwrite.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, 0.0 elsewhere; multiplying zeroes the rest.
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)

# Apply the custom initializer to every bias of net, printing the values
# before and after.
for name, param in net.named_parameters():
    if 'bias' in name:
        print('before')
        print(name, param.data)
        # The in-place helper is named `init_weight_` (trailing underscore).
        init_weight_(param)
        print('after')
        print(name, param.data)


before
0.bias tensor([1., 1., 1.])
after
0.bias tensor([ 0.0000,  0.0000, -8.5646])
before
2.bias tensor([1.])
after
2.bias tensor([6.8773])


# 3. 模型参数的共享
在多个层之间共享模型参数有两种方式：    
1. Module类forward函数多次调用一层
2. 传入Sequential同一实例

In [58]:
# Place the very same Linear instance at two positions of the Sequential so
# both positions share one weight.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(*[linear, linear])
print(net)

# Set every parameter yielded by named_parameters() to the constant 3.
for shared_name, shared_param in net.named_parameters():
    init.constant_(shared_param, 3)
    print(shared_name, shared_param.data)


Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [59]:
# Both positions of the Sequential refer to one and the same module object.
print(net[0] is net[1])

True


In [60]:
# During backpropagation the gradients of a shared parameter accumulate:
# the weight is used by both layers, so both uses contribute to its .grad.
x = torch.ones((1, 1))
y = torch.sum(net(x))
print(y)
y.backward()
shared_weight = net[0].weight
print(shared_weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])
