In [23]:
import torch
import torch.nn as nn
import copy

def print_(model):
    """Print every named parameter of ``model`` with its value and gradient.

    For each ``(name, parameter)`` pair yielded by ``model.named_parameters()``
    three labelled lines are written -- the name, the parameter itself and
    ``parameter.grad`` -- followed by a ``===`` separator line.

    Args:
        model: object exposing ``named_parameters()`` (e.g. an ``nn.Module``).

    Returns:
        None. The function only writes to stdout.
    """
    for name, params in model.named_parameters():
        # Label/value pairs, emitted in this exact order for each parameter.
        report = (
            ('-->name:', name),
            ('-->para:', params),
            ('-->grad_value:', params.grad),
        )
        for label, value in report:
            print(label, value)
        print("===")


class simpleModel(nn.Module):
    """Minimal two-layer network used to inspect parameters and gradients.

    The model applies a bias-free 1-channel 3x3 convolution with ``'same'``
    padding, flattens everything except the batch dimension, and feeds the
    result to a bias-free 4 -> 4 linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 1, kernel_size=3, padding='same', bias=False)
        # The flattened conv output must provide exactly 4 features per sample.
        self.linear = nn.Linear(4, 4, bias=False)

    def forward(self, x):
        """Run conv -> flatten -> linear and return the linear layer's output."""
        features = self.conv(x)
        batch_size = features.shape[0]
        flattened = features.view(batch_size, -1)  # keep batch dim, merge the rest
        return self.linear(flattened)


# Seed PyTorch's global RNG so that the random inputs and the randomly
# initialised weights -- and therefore every parameter/gradient printed by the
# following cells -- are identical on each "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# Two independent random input batches (4 samples of shape 1x2x2) and one
# random target of shape (4, 4) shared by all losses below.
x = torch.rand([4, 1, 2, 2])
y = torch.rand([4, 4])
x1 = torch.rand([4, 1, 2, 2])
# y1 is a gradient-tracking leaf tensor; a later cell builds a loss from it
# directly, i.e. a loss that never passes through the model.
y1 = torch.rand([4, 4], requires_grad=True)
model = simpleModel()
# `optimizer` manages every parameter of the model (conv and linear).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Deep copy of the initial weights so later cells can restore them with
# load_state_dict(); without the copy the tensors would alias the live
# parameters and change together with them.
model_dict = copy.deepcopy(model.state_dict())
optimizer1 = torch.optim.Adam(model.conv.parameters(), lr=1e-3)    # only updates the conv parameters
loss_fn = nn.CrossEntropyLoss()


In [24]:
# Walk through one full update with `optimizer` (which manages all parameters),
# printing parameters and gradients after each stage.
# Forward pass through conv + linear, then compute the loss against y.
out = model(x)
loss1 = loss_fn(out, y)
# Label below: "original parameters". No backward pass has run yet.
print("原参数")
print_(model)
optimizer.zero_grad()
# Label below: "gradients zeroed".
print("梯度清零")
print_(model)
# Populate .grad on every parameter that took part in computing loss1.
loss1.backward()
# Label below: "backward pass". Parameter values are still unchanged here.
print("反向传播")
print_(model)
optimizer.step()  # the parameters are only updated once step() is called
# Label below: "parameters updated".
print("参数更新")
print_(model)

原参数
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: None
===
-->name: linear.weight
-->para: Parameter containing:
tensor([[-0.1398,  0.1009, -0.2804,  0.3269],
        [-0.1990,  0.0875,  0.4075,  0.2321],
        [-0.2026, -0.0283, -0.2613,  0.0742],
        [ 0.0089, -0.3535,  0.4244,  0.1762]], requires_grad=True)
-->grad_value: None
===
梯度清零
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: None
===
-->name: linear.weight
-->para: Parameter containing:
tensor([[-0.1398,  0.1009, -0.2804,  0.3269],
        [-0.1990,  0.0875,  0.4075,  0.2321],
        [-0.2026, -0.0283, -0.2613,  0.0742],
        [ 0.0089, -0.3535,  0.4244,  0.1762]], requires_grad=True)
-->grad_value: No

In [25]:
# Restore the initial weights, then back-propagate a loss that does NOT depend
# on the model, to show which gradients backward() actually touches.
model.load_state_dict(model_dict)
# Label below: "original model loaded again".
print("再次加载原模型")
print_(model)           # only the parameter values were restored; the gradients are unchanged
# This loss is built from y1 and y only -- the model is not part of its graph.
loss2 = loss_fn(y1, y)
optimizer1.zero_grad()  # only clears the gradients of the conv parameters
# Label below: "gradients zeroed".
print("梯度清零")
print_(model)
# backward() only produces gradients for tensors in the loss's graph. Here that
# is just y1, so no model parameter receives a new gradient from this call.
loss2.backward()    # no model gradients are computed: the loss never went through the model
# Label below: "backward pass again".
print("再次反向传播")
print_(model)
optimizer1.step()   # optimizer1 only uses the conv gradients to update the conv parameters
# Label below: "updated again".
print("再次更新")
print_(model)

再次加载原模型
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: tensor([[[[ 0.0124,  0.0935,  0.1211],
          [ 0.0403,  0.1514,  0.1282],
          [ 0.0226,  0.0079, -0.0032]]]])
===
-->name: linear.weight
-->para: Parameter containing:
tensor([[-0.1398,  0.1009, -0.2804,  0.3269],
        [-0.1990,  0.0875,  0.4075,  0.2321],
        [-0.2026, -0.0283, -0.2613,  0.0742],
        [ 0.0089, -0.3535,  0.4244,  0.1762]], requires_grad=True)
-->grad_value: tensor([[-0.0397, -0.0126, -0.0421, -0.0322],
        [ 0.1288,  0.0422,  0.1439,  0.1035],
        [-0.0876, -0.0290, -0.0951, -0.0706],
        [-0.0015, -0.0005, -0.0067, -0.0006]])
===
梯度清零
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-

In [26]:
# Restore the initial weights and run a forward/backward pass through the
# model, but clear and update only the conv parameters via optimizer1.
model.load_state_dict(model_dict)
# Label below: "original model loaded again".
print("再次加载原模型")
print_(model)
out1 = model(x1)
loss2 = loss_fn(out1, y)
# optimizer1 manages only model.conv, so this clears conv's gradient and leaves
# whatever is stored in linear.weight.grad in place.
optimizer1.zero_grad()
# Label below reads "parameters zeroed"; what was cleared is the conv gradient.
print("参数清零")
print_(model)
# The loss went through the model, so both conv and linear receive gradients.
# They are added to the existing .grad values, so linear.weight.grad (never
# cleared in this cell) accumulates on top of its earlier contents.
loss2.backward()    # back-propagation computes gradients for all parameters of the network
# Label below: "backward pass again".
print("再次反向传播")
print_(model)
optimizer1.step()   # optimizer1 only uses the conv gradients to update the conv parameters
# Label below: "updated again".
print("再次更新")
print_(model)

再次加载原模型
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: tensor([[[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]]])
===
-->name: linear.weight
-->para: Parameter containing:
tensor([[-0.1398,  0.1009, -0.2804,  0.3269],
        [-0.1990,  0.0875,  0.4075,  0.2321],
        [-0.2026, -0.0283, -0.2613,  0.0742],
        [ 0.0089, -0.3535,  0.4244,  0.1762]], requires_grad=True)
-->grad_value: tensor([[-0.0397, -0.0126, -0.0421, -0.0322],
        [ 0.1288,  0.0422,  0.1439,  0.1035],
        [-0.0876, -0.0290, -0.0951, -0.0706],
        [-0.0015, -0.0005, -0.0067, -0.0006]])
===
参数清零
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: tensor([[[[0., 0., 0.],
       

In [27]:
# Same experiment as the previous cell, but WITHOUT clearing gradients first,
# to show that backward() accumulates into the existing .grad tensors.
model.load_state_dict(model_dict)
# Label below: "original model loaded again".
print("再次加载原模型")
print_(model)
out1 = model(x1)
loss2 = loss_fn(out1, y)
# Deliberately left disabled -- skipping zero_grad() is the point of this cell.
# optimizer1.zero_grad()
print("参数不清零")   # label: "not zeroed" -- state when the gradients are not cleared
print_(model)
# Gradients are added to what is already stored. Assuming the previous cell ran
# just before this one (same x1, y and restored weights), conv.weight.grad ends
# up at twice its value from that cell. linear.weight.grad is not simply
# doubled: it was never cleared, so it also still contains older contributions.
loss2.backward()    # computes gradients for all parameters and accumulates them, since nothing was cleared
# Label below: "backward pass again".
print("再次反向传播")
print_(model)
optimizer1.step()   # optimizer1 only uses the conv gradients to update the conv parameters
# Label below: "updated again".
print("再次更新")
print_(model)

再次加载原模型
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)
-->grad_value: tensor([[[[ 0.0242,  0.2385,  0.1565],
          [ 0.0344,  0.1328,  0.1689],
          [ 0.0174,  0.0279, -0.0100]]]])
===
-->name: linear.weight
-->para: Parameter containing:
tensor([[-0.1398,  0.1009, -0.2804,  0.3269],
        [-0.1990,  0.0875,  0.4075,  0.2321],
        [-0.2026, -0.0283, -0.2613,  0.0742],
        [ 0.0089, -0.3535,  0.4244,  0.1762]], requires_grad=True)
-->grad_value: tensor([[-0.1014, -0.0382, -0.0966, -0.0670],
        [ 0.3467,  0.1282,  0.3387,  0.2350],
        [-0.2324, -0.0883, -0.2226, -0.1565],
        [-0.0129, -0.0017, -0.0195, -0.0115]])
===
参数不清零
-->name: conv.weight
-->para: Parameter containing:
tensor([[[[ 0.0248,  0.1663,  0.0988],
          [ 0.1486,  0.2089,  0.2992],
          [-0.0483, -0.0198,  0.2775]]]], requires_grad=True)


总结：PyTorch 的更新机制是：loss 先通过反向传播（backward）计算出所有参与了正向传播的网络参数的梯度，并累加到各参数的 .grad 上；优化器再通过 step() 将这些梯度应用到该优化器所管理的参数的更新上。