# 16 PyTorch基础

## 1. 层和块

单独的层：接受输入 -> 产生输出

块：由单个或多个层组成，可以看作是层的组合，可以递归组合

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

# A two-layer MLP assembled from built-in layers with nn.Sequential.
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

# Random batch of shape (2, 20); calling the module runs its forward pass.
X = torch.rand(2, 20)
net(X)

tensor([[-0.0937,  0.1797, -0.0505,  0.1489, -0.0124,  0.0220,  0.0561,  0.1562,
         -0.1167, -0.0302],
        [-0.0859,  0.1789, -0.0683,  0.0039,  0.0464, -0.1014,  0.0598,  0.2357,
          0.0031, -0.0190]], grad_fn=<AddmmBackward0>)

### 自定义块

块是nn.Module的子类，必须具有：

1. 一个__init__方法，用于创建模型参数，其中必须先调用super().__init__()

2. 一个forward方法，用于定义前向传播

In [3]:
class MLP_1(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear.

    Args:
        in_features: size of each input sample.
        hidden_features: width of the hidden layer.
        out_features: size of each output sample.
    """

    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.linear_1 = nn.Linear(in_features, hidden_features)
        self.linear_2 = nn.Linear(hidden_features, out_features)

    def forward(self, X):
        """Apply ``linear_1``, a ReLU, then ``linear_2`` to ``X``."""
        # F.relu is a plain function and can be called directly, whereas
        # nn.ReLU is a class that has to be instantiated before it is used.
        hidden = self.linear_1(X)
        activated = F.relu(hidden)
        return self.linear_2(activated)
    
# Instantiate the custom block and run it on a random batch of shape (2, 20).
net = MLP_1(20, 256, 10)
X = torch.randn(size=(2, 20))
print(net(X))

tensor([[ 0.2158,  0.2293,  0.1619, -0.0421, -0.3486,  0.2489, -0.5542,  0.1339,
         -0.0258, -0.0579],
        [-0.0101,  0.3037,  0.1762,  0.1428, -0.3536,  0.0250, -0.2700, -0.2530,
         -0.0080,  0.1015]], grad_fn=<AddmmBackward0>)


### 顺序块(Sequential)

Sequential模块可以将多个网络层组合成一个网络，将这些层存入_modules字典中

在forward方法中，按照顺序执行它们

以下手搓一个MySequential块:

In [4]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: modules to chain, in the order they should be applied.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # ``_modules`` is the ordered dict nn.Module uses to track its
            # children. The keys must be strings: PyTorch concatenates them
            # into dotted names for state_dict(), named_parameters() and
            # repr(), so an integer key raises TypeError in those calls.
            self._modules[str(idx)] = module

    def forward(self, X):
        """Feed ``X`` through every stored module in insertion order."""
        for block in self._modules.values():
            X = block(X)
        return X
    
# Use the hand-written container the same way as nn.Sequential.
net = MySequential(nn.Linear(10,512),nn.ReLU(),nn.Linear(512,1))
net(torch.rand(2,10))

tensor([[0.2604],
        [0.2672]], grad_fn=<AddmmBackward0>)

### 在前向传播函数中执行代码

在 forward 方法中可以直接编写任意的张量运算和 Python 控制流，从而实现更底层、更灵活的自定义计算，而不必局限于调用封装好的层和函数。

如下例：

实现 $w_1 \cdot \mathrm{ReLU}(w_{rand}(w_1 x + b_1)) + b_1$，其中 $w_{rand}$ 是固定的随机矩阵（不参与训练）

In [5]:
class FixedHiddenMLP(nn.Module):
    """Block whose forward pass mixes a learned layer with a fixed random matrix.

    The same ``nn.Linear(20, 20)`` is applied twice, so both applications
    share one set of parameters. Between them the activations are multiplied
    by a constant random matrix and passed through a ReLU.
    """

    def __init__(self):
        super().__init__()
        # Used twice in forward(), so the two linear steps share parameters.
        self.linear = nn.Linear(20, 20)
        # Constant (non-trainable) random weights. Registering them as a
        # buffer makes .to()/.cuda()/.double() convert them together with the
        # module; persistent=False keeps them out of state_dict(), exactly as
        # with the plain tensor attribute this replaces.
        self.register_buffer("rand_w", torch.rand(20, 20), persistent=False)

    def forward(self, X):
        """Compute ``linear(relu(linear(X) @ rand_w))``."""
        X = self.linear(X)
        h = F.relu(torch.matmul(X, self.rand_w))
        return self.linear(h)
    
# Build the block and run it on a random batch of shape (2, 20).
net = FixedHiddenMLP()
net(torch.rand(2,20))

tensor([[ 0.0602, -0.2257, -0.1723,  0.1205,  0.0465, -0.2156, -0.0561, -0.0472,
          0.0984,  0.0034, -0.2141,  0.0454,  0.2163, -0.0227,  0.1513,  0.1046,
          0.0141,  0.1319,  0.1106, -0.1454],
        [ 0.0317, -0.2820, -0.4336,  0.6500,  0.0677,  0.2772, -0.4030, -0.7115,
         -0.2275,  0.0409, -0.0601, -0.2752,  0.2198, -0.1042,  0.3759, -0.0593,
         -0.0254,  0.0237,  0.3004,  0.2488]], grad_fn=<AddmmBackward0>)

## 2. 参数管理

### 参数访问

net[i]可以访问第i层的网络

In [6]:
# Small MLP used for the parameter-access examples.
net = nn.Sequential(nn.Linear(4,16),nn.ReLU(),nn.Linear(16,1))
# Indexing a Sequential returns the layer at that position;
# state_dict() lists that layer's parameters by name.
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.0163,  0.0499, -0.0591,  0.1766, -0.0457,  0.0576, -0.0720, -0.0080,
                       -0.1819, -0.2467, -0.1974, -0.1756, -0.1169, -0.1266, -0.0771, -0.1057]])),
             ('bias', tensor([0.0731]))])

访问参数的w、b值及它们的梯度：

In [7]:
# The weight is printed as a Parameter (tensor plus requires_grad flag).
print(net[0].weight)
# .data gives the underlying tensor without the Parameter wrapper.
print(net[0].bias.data)
# No backward pass has run yet, so the gradient is still None.
# Identity comparison (`is None`) is the correct test for None.
print(net[0].bias.grad is None)

Parameter containing:
tensor([[ 0.1488, -0.4096, -0.1873,  0.1729],
        [-0.3364,  0.0039, -0.3798,  0.1748],
        [-0.2610,  0.2074, -0.1086,  0.3270],
        [-0.3054, -0.4790,  0.2081, -0.4523],
        [-0.4647, -0.2731, -0.3505, -0.4309],
        [-0.2665,  0.4562,  0.4150,  0.3067],
        [-0.1316, -0.1673, -0.0766, -0.2751],
        [-0.1977, -0.3928, -0.3569,  0.2064],
        [ 0.2682, -0.1237, -0.1002, -0.3673],
        [-0.2009, -0.4034,  0.4111,  0.3932],
        [ 0.2700, -0.0105, -0.0394,  0.1753],
        [ 0.0795, -0.0129,  0.0066,  0.4801],
        [ 0.4254, -0.3943, -0.4794, -0.2830],
        [ 0.3051, -0.3782, -0.4022, -0.4164],
        [ 0.0654,  0.2872,  0.1431,  0.3079],
        [ 0.3066,  0.3419,  0.0046, -0.2031]], requires_grad=True)
tensor([-0.1969,  0.4057, -0.4210,  0.1963, -0.4943,  0.3071,  0.2810, -0.0128,
         0.0702,  0.4643,  0.4901,  0.4625,  0.2548,  0.2357,  0.3704,  0.2693])
True


### 一次性访问所有参数

In [12]:
# Parameter names and shapes of the first layer only.
print([(name, param.shape) for name, param in net[0].named_parameters()])
# Parameter names and shapes of the whole network; names carry the layer index as a prefix.
print([(name,param.shape) for name, param in net.named_parameters()])

[('weight', torch.Size([16, 4])), ('bias', torch.Size([16]))]
[('0.weight', torch.Size([16, 4])), ('0.bias', torch.Size([16])), ('2.weight', torch.Size([1, 16])), ('2.bias', torch.Size([1]))]


由于参数名带有层索引前缀（如 `0.weight`），我们也可以通过 `state_dict()` 按名称直接访问某个参数：

In [15]:
# Look a parameter up by its prefixed name in the network's state dict.
print(net.state_dict()['0.weight'].data)

tensor([[ 0.1488, -0.4096, -0.1873,  0.1729],
        [-0.3364,  0.0039, -0.3798,  0.1748],
        [-0.2610,  0.2074, -0.1086,  0.3270],
        [-0.3054, -0.4790,  0.2081, -0.4523],
        [-0.4647, -0.2731, -0.3505, -0.4309],
        [-0.2665,  0.4562,  0.4150,  0.3067],
        [-0.1316, -0.1673, -0.0766, -0.2751],
        [-0.1977, -0.3928, -0.3569,  0.2064],
        [ 0.2682, -0.1237, -0.1002, -0.3673],
        [-0.2009, -0.4034,  0.4111,  0.3932],
        [ 0.2700, -0.0105, -0.0394,  0.1753],
        [ 0.0795, -0.0129,  0.0066,  0.4801],
        [ 0.4254, -0.3943, -0.4794, -0.2830],
        [ 0.3051, -0.3782, -0.4022, -0.4164],
        [ 0.0654,  0.2872,  0.1431,  0.3079],
        [ 0.3066,  0.3419,  0.0046, -0.2031]])


### 从嵌套块收集参数

使用`net.add_module()`方法



In [17]:
# Random input batch of shape (2, 4) for the nested network.
X = torch.rand(2,4)
def block1():
    """Return a fresh 4 -> 8 -> 4 MLP with a ReLU after each linear layer."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four ``block1`` copies named ``block 0`` .. ``block 3``."""
    stacked = nn.Sequential()
    # Each block1() call creates an independent sub-block; add_module
    # registers it under an explicit name, which is where the nesting happens.
    for name in (f'block {i}' for i in range(4)):
        stacked.add_module(name, block1())
    return stacked

# Nest block2 (itself made of four block1 copies) inside another Sequential,
# followed by a final 4 -> 1 linear layer, then run it on X.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.1684],
        [0.1684]], grad_fn=<AddmmBackward0>)

In [18]:
# Print the module tree to inspect how the nested blocks are organised.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## 3. 参数初始化