# 深度学习计算
## 层和块
- **块**可以描述单个层、由多个层组成的组件或整个模型本身：
![多层网络组合成块图](../image/多层网络组合成块.jpg)
- 块由类表示。它的任何子类都必须定义一个将其输入转换为输出的前向传播函数，并且必须存储任何必需的参数（有些块不需要任何参数），最后，为了计算梯度，块必须具有反向传播函数。

In [1]:
import numpy as np
import torch
import torch.nn as nn

# 定义网络结构
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)

# 初始化权重（kaiming均匀初始化，这是PyTorch默认的初始化方法，比较适合relu函数，一般不需要手动写，这里只是第一次了解适应一下，一般PyTorch的神经网络层会自动初始化权重和偏置）
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias) # 默认偏置初始化为0

net.apply(init_weights)

# 生成输入数据
X = torch.empty(2, 20).uniform_() # empty生成的是随机张量，uniform可以直接修改为均匀分布的随机数

# 前向计算
print(net(X))

tensor([[ 0.0488,  0.5612,  0.9806,  0.3183,  1.1335, -0.5094, -1.4717, -0.3531,
         -0.0933, -0.8531],
        [ 0.2146,  0.5187,  0.5918, -0.0031,  1.4823,  0.8675, -1.0937,  0.7217,
         -0.3026,  0.0085]], grad_fn=<AddmmBackward0>)


### 自定义块
- 必须要实现的一些功能：
    - 将输入数据作为其前向传播函数的参数；
    - 通过前向传播函数来生成输出（输入输出的形状根据实际设置）；
    - 计算其输出关于输入的梯度，可通过其反向传播函数进行访问（继承了 $\mathrm{PyTorch}$ 的模型的话，这个过程可以自动实现）；
    - 存储和访问前向传播计算所需的参数（继承了 $\mathrm{PyTorch}$ 的模型的话，这个也不需要手动实现）；
    - 自行初始化模型参数；

In [2]:
class MLP(nn.Module):
    # 自定义初始化声明神经网络层
    def __init__(self, **kwargs):
        super().__init__(**kwargs) # 继承一下父类，可以省去一些函数代码的重新实现
        self.hidden = nn.Linear(20, 256)
        self.relu = nn.ReLU()
        self.out = nn.Linear(256, 10)
        
    def forward(self, X):
        X = self.hidden(X)
        X = self.relu(X)
        return self.out(X)

In [3]:
net = MLP()
net(X)

tensor([[ 0.0469,  0.2463,  0.0863, -0.1826,  0.1999, -0.0834, -0.0234,  0.2893,
         -0.0872,  0.1544],
        [-0.0553,  0.3012,  0.0235, -0.2252,  0.1415, -0.1559, -0.0553,  0.2332,
         -0.0359,  0.2801]], grad_fn=<AddmmBackward0>)

### 顺序块
- $\mathrm{Sequential}$ 类的工作：串联其他模块。
- 自定义的 $\mathrm{Sequential}$ 只需定义：一种将块逐个追加到列表中的函数；一种前向传播函数，用于将输入按追加块的顺序传递给块组成的“链条”。

In [4]:
from collections import OrderedDict # 这个字典会严格记录键值对的插入顺序，遍历时也会按照这个顺序返回，这种顺序性在定义网络层的时候非常适用


class MySequential(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._modules = OrderedDict() # PyTorch使用_modules管理子模块
        
    def add(self, name, module):
        self._modules[name] = module
        
    def forward(self, X):
        for module in self._modules.values():
            X = module(X)
        return X
    
net = MySequential()
net.add("Dense1", nn.Linear(20, 256))
net.add("ReLu", nn.ReLU())
net.add("Dense2", nn.Linear(256, 10))
net(X)

tensor([[ 0.2800,  0.4588, -0.0936,  0.1403,  0.0847,  0.0431, -0.0910, -0.0711,
         -0.2301,  0.1852],
        [ 0.1910,  0.3173, -0.1902,  0.1563,  0.1460,  0.2344, -0.1021, -0.0675,
         -0.1045,  0.0204]], grad_fn=<AddmmBackward0>)

### 前向传播函数执行
- 有时我们可能希望合并既不是上一层的结果也不是可更新参数的项，我们称之为常数参数（也就是有一些参数可能是指定不会更新的常量），或者需要使用Python控制流，这样就需要更灵活的前向传播函数。
- 示例 $f(\mathrm{x,w})=c\cdot \mathrm{w^Tx}$（下述示例只是展示这么一个原理流程，最后那个循环限制一般不会应用在实际任务中）：

In [5]:
class FixedHiddenMLP(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        """设置不会随训练更新的参数常量权重"""
        self.register_buffer(
            'rand_weight',
            torch.from_numpy(np.random.uniform(size=(20, 20)).astype(np.float32))
        )
        self.dense = nn.Linear(20, 20)
        self.relu = nn.ReLU()
        
    def forward(self, X):
        X = self.relu(self.dense(X))
        
        X = torch.relu(X @ self.rand_weight + 1) # @是PyTorch里面定义的矩阵乘法
        
        X = self.relu(self.dense(X))
        
        while X.abs().sum() > 1:
            X /= 2
            
        return X.sum()
    
net = FixedHiddenMLP()
print(net(X))

tensor(0.6320, grad_fn=<SumBackward0>)


- 混合搭配组合块：

In [6]:
class NestMLP(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.net = nn.Sequential(
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.dense = nn.Linear(32, 16)
        self.relu = nn.ReLU()
        
    def forward(self, X):
        X = self.net(X)
        return  self.relu(self.dense(X))
    
"""构建完整模型"""
chimera = nn.Sequential(
    NestMLP(), # 嵌套网络块
    nn.Linear(16, 20),
    FixedHiddenMLP() # 固定参数网络块
)

chimera(X)

tensor(0.5896, grad_fn=<SumBackward0>)

## 参数管理
- 访问参数，用于调试、诊断和可视化；
- 参数初始化；
- 在不同模型组件间共享参数；

In [7]:
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)

X = torch.empty(2, 4).uniform_()
net(X)

tensor([[0.0856],
        [0.1191]], grad_fn=<AddmmBackward0>)

### 参数访问
- 当通过 $\mathrm{Sequential}$ 类定义模型时，我们可以通过索引来访问模型的任意层。

In [8]:
print(list(net[2].parameters())) # 访问某第三层的参数

[Parameter containing:
tensor([[ 0.1565, -0.2371,  0.0991,  0.0967,  0.0907, -0.0049, -0.2985,  0.2374]],
       requires_grad=True), Parameter containing:
tensor([0.1812], requires_grad=True)]


In [9]:
print(type(net[0].bias)) # 访问第一层的偏置
print(net[0].bias)
print(net[0].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.3782, -0.0009,  0.0471, -0.0647, -0.4824,  0.4135, -0.0837,  0.3016],
       requires_grad=True)
tensor([-0.3782, -0.0009,  0.0471, -0.0647, -0.4824,  0.4135, -0.0837,  0.3016])


In [10]:
print(net[0].weight.grad) # 由于未调用反向传播，故这里没有梯度

None


In [11]:
print(dict(net[0].named_parameters())) # 一次性访问所有参数（parameters()方法不包含参数名，这里这个方法包含参数名）
print(dict(net.named_parameters()))

{'weight': Parameter containing:
tensor([[ 0.1637,  0.0120,  0.0789,  0.2879],
        [-0.1766,  0.0839,  0.1395,  0.4501],
        [-0.2905,  0.4220, -0.1680,  0.4263],
        [ 0.3222, -0.0109, -0.0829, -0.0532],
        [ 0.1893,  0.2869, -0.4902, -0.3823],
        [ 0.3960, -0.3771,  0.4002,  0.1650],
        [-0.3039, -0.2589,  0.2812,  0.0850],
        [ 0.4714, -0.2655, -0.0577, -0.3836]], requires_grad=True), 'bias': Parameter containing:
tensor([-0.3782, -0.0009,  0.0471, -0.0647, -0.4824,  0.4135, -0.0837,  0.3016],
       requires_grad=True)}
{'0.weight': Parameter containing:
tensor([[ 0.1637,  0.0120,  0.0789,  0.2879],
        [-0.1766,  0.0839,  0.1395,  0.4501],
        [-0.2905,  0.4220, -0.1680,  0.4263],
        [ 0.3222, -0.0109, -0.0829, -0.0532],
        [ 0.1893,  0.2869, -0.4902, -0.3823],
        [ 0.3960, -0.3771,  0.4002,  0.1650],
        [-0.3039, -0.2589,  0.2812,  0.0850],
        [ 0.4714, -0.2655, -0.0577, -0.3836]], requires_grad=True), '0.bias': Par

In [12]:
"""从嵌套块收集参数"""
X = torch.empty(size=(2, 16)).uniform_()

def module1():
    return nn.Sequential(
        nn.Linear(16, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU()
    )

def module2():
    modules = []
    for _ in range(4):
        modules.append(module1())
    return nn.Sequential(*modules)

rgnet = nn.Sequential(
    module2(),
    nn.Linear(16, 10)
)

rgnet(X)

tensor([[-0.0204, -0.0319, -0.0711,  0.2111,  0.1443, -0.0613, -0.0342, -0.1088,
         -0.1388, -0.1431],
        [-0.0202, -0.0319, -0.0713,  0.2115,  0.1443, -0.0614, -0.0341, -0.1087,
         -0.1384, -0.1434]], grad_fn=<AddmmBackward0>)

In [13]:
print(rgnet.named_parameters)

<bound method Module.named_parameters of Sequential(
  (0): Sequential(
    (0): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (3): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=16, out_features=10, bias=True)
)>


### 参数初始化
- 内置初始化，$\mathrm{PyTorch}$ 是直接内置默认初始化参数的，即像前述的一样某层网络会自行初始化参数。
- 自定义初始化：

In [14]:
"""逐层初始化（基础）"""
def custom_init(m):
    if isinstance(m, nn.Linear):
        # 均匀分布初始化 [-0.1, 0.1]
        nn.init.uniform_(m.weight, -0.1, 0.1)
        # 偏置初始化为1
        nn.init.constant_(m.bias, 1.0)

net = nn.Sequential(
    nn.Linear(8, 32),
    nn.ReLU(),
    nn.Linear(32, 4)
)

net.apply(custom_init)

Sequential(
  (0): Linear(in_features=8, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=4, bias=True)
)

In [15]:
print(net[0].bias.data)
print(net[0].weight.data)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
tensor([[ 0.0845, -0.0927, -0.0051, -0.0591, -0.0818, -0.0238, -0.0574, -0.0345],
        [ 0.0129,  0.0978,  0.0314,  0.0757, -0.0399, -0.0337,  0.0914, -0.0711],
        [ 0.0414,  0.0297, -0.0674, -0.0019,  0.0947, -0.0402,  0.0336,  0.0106],
        [ 0.0675,  0.0439, -0.0252, -0.0059,  0.0106,  0.0692,  0.0158, -0.0868],
        [ 0.0024, -0.0575,  0.0993, -0.0377,  0.0840,  0.0106, -0.0862, -0.0016],
        [-0.0972,  0.0099, -0.0154,  0.0882, -0.0650,  0.0459, -0.0416, -0.0865],
        [ 0.0739,  0.0625, -0.0157,  0.0348, -0.0401,  0.0072, -0.0614, -0.0939],
        [-0.0630,  0.0483, -0.0793, -0.0712,  0.0016, -0.0036,  0.0723, -0.0398],
        [-0.0302,  0.0622, -0.0696,  0.0739, -0.0906, -0.0824,  0.0484, -0.0959],
        [ 0.0737,  0.0826,  0.0531, -0.0345,  0.0983, -0.0931, -0.0440,  0.0517],
        [ 0.0052,  0.0222,  0.0493,

In [16]:
"""直接设置参数"""
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000,  0.9073,  0.9949,  0.9409,  0.9182,  0.9762,  0.9426,  0.9655])

### 参数绑定
- 多个层间共享参数：

In [17]:
net = nn.Sequential(
    nn.Linear(20, 8),
    nn.ReLU(),
    nn.Linear(8, 8),
    nn.ReLU(),
    nn.Linear(8, 8),
    nn.ReLU(),
    nn.Linear(8, 10),
)

"""共享参数层定义，第二层和第三层绑定"""
shared_layer = nn.Linear(8, 8)
net[2] = shared_layer
net[4] = shared_layer

X = torch.empty(size=(2, 20)).uniform_()
net(X)

print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
"""确保是对象为同一个，而不是相同值"""
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## 延后初始化
- 延后初始化，即直到数据第一次通过模型传递时，框架才会动态地推断出每个层的大小。

**注**：如果使用延迟初始化配合优化器的话，要注意，优化器一定要在初始化参数之后再创建。

In [18]:
"""基础方法结合手动实现（再高一级的话可以使用装饰器，为任意的nn.module都添加延迟初始化方法）"""

class Net(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer = None # 通过延迟初始化Sequential来进行延迟初始化
        
    def forward(self, X):
        if self.layer is None:
            self.layer = nn.Sequential(
                nn.Linear(X.shape[-1], 256),
                nn.ReLU(),
                nn.Linear(256, 10)
            ).to(X.device)
            
        return self.layer(X)
    
net = Net()

print(net.named_parameters)

<bound method Module.named_parameters of Net()>


In [19]:
X = torch.empty(size=(2, 20)).uniform_()
net(X)

print(net.named_parameters)
dict(net.named_parameters())

<bound method Module.named_parameters of Net(
  (layer): Sequential(
    (0): Linear(in_features=20, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=10, bias=True)
  )
)>


{'layer.0.weight': Parameter containing:
 tensor([[ 0.1826,  0.1373,  0.1924,  ...,  0.1936,  0.0229,  0.0384],
         [-0.0354, -0.1353,  0.0223,  ..., -0.0792, -0.0121, -0.0005],
         [ 0.2221, -0.1461, -0.0855,  ...,  0.0832, -0.0940, -0.1745],
         ...,
         [ 0.0410, -0.1213,  0.1663,  ..., -0.2152,  0.0192,  0.0083],
         [-0.2026,  0.0560, -0.1945,  ...,  0.1528,  0.1452,  0.0532],
         [ 0.1185, -0.0963,  0.1340,  ...,  0.1784,  0.0988,  0.0216]],
        requires_grad=True),
 'layer.0.bias': Parameter containing:
 tensor([-0.1624, -0.1978,  0.2120,  0.1177,  0.2041, -0.0354,  0.1369,  0.0089,
          0.1326,  0.0986,  0.1256,  0.0936, -0.2094,  0.0953,  0.2233,  0.2158,
         -0.0699, -0.2023,  0.1430,  0.1504, -0.0834,  0.0375,  0.1569,  0.0692,
          0.1154, -0.1165,  0.1314,  0.1882,  0.1652, -0.0318,  0.0725, -0.1844,
          0.0741,  0.0400, -0.1162, -0.1224, -0.1405,  0.0018,  0.1389, -0.1001,
          0.0668, -0.1692, -0.1461,  0.0586, 

## 自定义层
### 不带参数的层
- 要构建不带参数的层，我们只需继承基础层类并实现前向传播功能即可：

In [20]:
class CenteredLayer(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    @staticmethod
    def forward(X):
        return X - X.mean()
    
layer = CenteredLayer()
layer(np.array([1, 2, 3, 4, 5]))

array([-2., -1.,  0.,  1.,  2.])

In [21]:
net = nn.Sequential(
    nn.Linear(8, 128),
    CenteredLayer()
)

Y = net(torch.empty(size=(4, 8)).uniform_())
Y.mean()

tensor(4.6566e-09, grad_fn=<MeanBackward0>)

### 带参数的层

In [22]:
class MyDense(nn.Module):
    def __init__(self, units, in_units, **kwargs):
        super().__init__(**kwargs)
        self.weight = nn.Parameter(torch.Tensor(in_units, units))
        self.bias = nn.Parameter(torch.Tensor(units))
        
    def forward(self, X):
        linear = torch.matmul(X, self.weight) + self.bias
        return nn.functional.relu(linear)
    
dense = MyDense(units=3, in_units=5)
dict(dense.named_parameters())

{'weight': Parameter containing:
 tensor([[ 1.1049e+16,  1.4419e-42,  7.8869e-02],
         [ 2.8788e-01, -1.7655e-01,  8.3899e-02],
         [ 1.3952e-01,  4.5010e-01, -2.9045e-01],
         [ 4.2198e-01, -1.6805e-01,  4.2626e-01],
         [ 3.2222e-01, -1.0867e-02, -8.2945e-02]], requires_grad=True),
 'bias': Parameter containing:
 tensor([2.3048e-03, 1.8980e+01, 0.0000e+00], requires_grad=True)}

In [23]:
dense(torch.empty(size=(2, 5)).uniform_())

tensor([[3.1787e+15, 1.9171e+01, 2.5464e-02],
        [8.2646e+14, 1.9052e+01, 1.7230e-01]], grad_fn=<ReluBackward0>)

In [24]:
net = nn.Sequential(
    MyDense(8, in_units=64),
    MyDense(1, in_units=8)
)
net(torch.empty(size=(2, 64)).uniform_())

tensor([[1.1329e+16],
        [1.1344e+16]], grad_fn=<ReluBackward0>)

## 读写文件
### 加载保存张量
- $\mathrm{PyTorch}$ 在保存张量的的时候可以保留设备信息（加载时会自动恢复到原设备），可保存计算图、模型结构、梯度、设备信息，但是 $\mathrm{NumPy}$ 不行

In [27]:
x = np.arange(4)
torch.save(x, 'x-file.pt')

In [29]:
x2 = torch.load('x-file.pt', weights_only=False) # 默认为True文件里有些对象会加载不出来
x2

array([0, 1, 2, 3])

In [30]:
y = np.zeros(4)
torch.save([x, y], 'x-file.pt')
x2, y2 = torch.load('x-file.pt', weights_only=False)
(x2, y2)

(array([0, 1, 2, 3]), array([0., 0., 0., 0.]))

In [33]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict.pt')
mydict2 = torch.load('mydict.pt', weights_only=False)
mydict2

{'x': array([0, 1, 2, 3]), 'y': array([0., 0., 0., 0.])}

### 加载保存模型参数

In [42]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)
        
    def forward(self, X):
        X = nn.functional.relu(self.hidden(X))
        return self.output(X)
    
net = MLP()
X = torch.empty(size=(2, 20)).uniform_()
Y = net(X)  

In [44]:
torch.save(net.state_dict(), 'mlp.pth')

In [45]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.pth'))

<All keys matched successfully>

In [46]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

## $\mathrm{GPU}$ 加速训练

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cpu_device = torch.device('cpu')
gpu_device_0 = torch.device('cuda:0')
gpu_device_1 = torch.device('cuda:1')

(device(type='cuda', index=0),
 device(type='cpu'),
 device(type='cuda', index=1))

In [55]:
tensor = torch.tensor([1, 2, 3])
tensor.to(cpu_device), tensor.to(gpu_device_0)

(tensor([1, 2, 3]), tensor([1, 2, 3], device='cuda:0'))

In [56]:
torch.cuda.device_count()

1

In [57]:
torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1650'

### 张量与 $\mathrm{GPU}$

In [58]:
x = torch.tensor([1, 2, 3])
print(x.device)

cpu


In [60]:
X = torch.ones((2,3), device='cuda:0')
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

- 关于张量的复制或者计算，需要保证操作的张量都在通过一个设备下（都在 $\mathrm{CPU}$ 或者同一个 $\mathrm{GPU}$）

In [62]:
x.to('cuda:0') + X

tensor([[2., 3., 4.],
        [2., 3., 4.]], device='cuda:0')

### 神经网络与 $\mathrm{GPU}$

In [65]:
net = nn.Sequential(
    nn.Linear(3, 1),
    nn.ReLU()
)
net.to(device)
net(X)

tensor([[0.9704],
        [0.9704]], device='cuda:0', grad_fn=<ReluBackward0>)

In [70]:
net[0].weight.device

device(type='cuda', index=0)