# 16 pytorch基础 + GPU使用和购买

## 1. 层和块

单独的层：接受输入 -> 产生输出

块：由单个或多个层组成，可以看作是层的组合，可以递归组合

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[-0.0324,  0.1930,  0.1136,  0.0561, -0.1197, -0.0341,  0.0474,  0.1003,
         -0.1171,  0.1128],
        [ 0.0125,  0.1131,  0.0332,  0.1121, -0.1705, -0.1838,  0.1163,  0.1166,
         -0.0539,  0.1968]], grad_fn=<AddmmBackward0>)

### 自定义块

块是nn.Module的子类，必须具有：

1. 一个__init__方法，用于创建模型参数，其中必须先调用super().__init__()

2. 一个forward方法，用于定义前向传播

In [3]:
class MLP_1(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.linear_1 = nn.Linear(in_features, hidden_features)
        self.linear_2 = nn.Linear(hidden_features, out_features)
    
    def forward(self, X):
        return self.linear_2(F.relu(self.linear_1(X))) # F.relu()和nn.ReLU()的区别：前者是函数可以直接调用，后者是类，必须要实例化的对象
    
net = MLP_1(20, 256, 10)
X = torch.randn(size=(2, 20))
print(net(X))

tensor([[ 0.0458, -0.3411, -0.0647,  0.2631,  0.0955, -0.1935, -0.0306, -0.0301,
          0.1263,  0.0469],
        [-0.0099, -0.2279, -0.0561,  0.3008,  0.3885, -0.0870,  0.1181, -0.1109,
         -0.1623,  0.3407]], grad_fn=<AddmmBackward0>)


### 顺序块(Sequential)

Sequential模块可以将多个网络层组合成一个网络，将这些层存入_modules字典中

在forward方法中，按照顺序执行它们

以下手搓一个MySequential块:

In [4]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self._modules[idx] = module
    
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X
    
net = MySequential(nn.Linear(10,512),nn.ReLU(),nn.Linear(512,1))
net(torch.rand(2,10))

tensor([[-0.2623],
        [-0.1976]], grad_fn=<AddmmBackward0>)

### 在前向传播函数中执行代码

一些更底层、更自定义化的实现方式，而非拘束于调用封装的层、函数等

如下例：

实现 $ w_1(ReLU(w_{rand}(w_1x+b_1)))+b_1$

In [5]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(20,20) # 这里两个线性层参数共享
        self.rand_w = torch.rand(20,20)

    def forward(self, X):
        X = self.linear(X)
        h = F.relu(torch.matmul(X,self.rand_w))
        return self.linear(h)
    
net = FixedHiddenMLP()
net(torch.rand(2,20))

tensor([[ 0.0085,  0.4203,  0.0649,  1.3703, -0.1977,  0.2106, -0.1088,  0.2462,
          0.1456,  0.2000,  0.9641, -0.1425, -0.4982, -0.0606, -0.1804, -0.0959,
          0.0254,  0.4742,  0.0198, -0.5820],
        [ 0.1497,  0.4888, -0.2205,  1.5827, -0.0253, -0.2441,  0.0189,  0.0553,
          0.7241,  0.3926,  1.6331, -0.0922, -0.8998, -0.3566, -0.6107,  0.4860,
          0.6308,  1.1412, -0.0549, -0.7823]], grad_fn=<AddmmBackward0>)

## 2. 参数管理

### 参数访问

net[i]可以访问第i层的网络

In [6]:
net = nn.Sequential(nn.Linear(4,16),nn.ReLU(),nn.Linear(16,1))
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.1012, -0.1866, -0.1400, -0.2238, -0.1253,  0.1726,  0.0540,  0.0655,
                       -0.2176, -0.0089, -0.1738,  0.0210,  0.0695, -0.0764, -0.2446,  0.1849]])),
             ('bias', tensor([0.1091]))])

访问参数的w、b值及它们的梯度：

In [7]:
print(net[0].weight)
print(net[0].bias.data)
print(net[0].bias.grad==None) # 还没有进行反向传播，故暂时没有梯度

Parameter containing:
tensor([[-0.3260, -0.4019, -0.3354,  0.1142],
        [ 0.4497,  0.3500, -0.3073,  0.0094],
        [ 0.4868, -0.4663, -0.3741,  0.3678],
        [ 0.0011,  0.3849,  0.0680,  0.3203],
        [-0.0929,  0.1115, -0.4080, -0.1022],
        [ 0.0833, -0.3810, -0.4092,  0.0362],
        [-0.1935,  0.2632, -0.4682, -0.1094],
        [-0.1529, -0.3460, -0.4699, -0.3269],
        [ 0.1164, -0.2936, -0.4728, -0.0343],
        [ 0.3521,  0.3736,  0.4241,  0.3322],
        [-0.2596,  0.1671, -0.4429,  0.4177],
        [ 0.1447,  0.3855, -0.1637, -0.2036],
        [-0.0415,  0.1079,  0.2792, -0.1466],
        [ 0.0095, -0.3707, -0.0424, -0.2631],
        [-0.3543,  0.4511, -0.0423,  0.1472],
        [ 0.1498,  0.3066,  0.1196, -0.0420]], requires_grad=True)
tensor([ 0.0308,  0.4518, -0.4753,  0.1688,  0.3911,  0.0658, -0.1912, -0.4984,
        -0.2732, -0.1166,  0.1797, -0.3508, -0.4953, -0.2771, -0.3353, -0.1804])
True


### 一次性访问所有参数

In [8]:
print([(name, param.shape) for name, param in net[0].named_parameters()])
print([(name,param.shape) for name, param in net.named_parameters()])

[('weight', torch.Size([16, 4])), ('bias', torch.Size([16]))]
[('0.weight', torch.Size([16, 4])), ('0.bias', torch.Size([16])), ('2.weight', torch.Size([1, 16])), ('2.bias', torch.Size([1]))]


这也告诉我们可以如下操作：

In [9]:
print(net.state_dict()['0.weight'].data)

tensor([[-0.3260, -0.4019, -0.3354,  0.1142],
        [ 0.4497,  0.3500, -0.3073,  0.0094],
        [ 0.4868, -0.4663, -0.3741,  0.3678],
        [ 0.0011,  0.3849,  0.0680,  0.3203],
        [-0.0929,  0.1115, -0.4080, -0.1022],
        [ 0.0833, -0.3810, -0.4092,  0.0362],
        [-0.1935,  0.2632, -0.4682, -0.1094],
        [-0.1529, -0.3460, -0.4699, -0.3269],
        [ 0.1164, -0.2936, -0.4728, -0.0343],
        [ 0.3521,  0.3736,  0.4241,  0.3322],
        [-0.2596,  0.1671, -0.4429,  0.4177],
        [ 0.1447,  0.3855, -0.1637, -0.2036],
        [-0.0415,  0.1079,  0.2792, -0.1466],
        [ 0.0095, -0.3707, -0.0424, -0.2631],
        [-0.3543,  0.4511, -0.0423,  0.1472],
        [ 0.1498,  0.3066,  0.1196, -0.0420]])


### 从嵌套块收集参数

使用`net.add_module()`方法



In [10]:
X = torch.rand(2,4)
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # 在这里嵌套
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.1791],
        [0.1791]], grad_fn=<AddmmBackward0>)

In [11]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## 3.参数初始化

内置的初始化器：`nn.init.normal_()`, `nn.init.zeros_()`等

### 初始化为高斯分布:

In [12]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0047, -0.0020,  0.0069, -0.0057]), tensor(0.))

### 初始化为常数:

In [13]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

### Xavier初始化方法

In [14]:
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.1793,  0.5085,  0.5227, -0.2806])
tensor([[42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
         42., 42.]])


### 自定义初始化

In [15]:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([16, 4])
Init weight torch.Size([1, 16])


tensor([[ 9.6866, -6.9685,  7.4098,  0.0000],
        [-0.0000,  0.0000,  0.0000,  6.2793]], grad_fn=<SliceBackward0>)

### 直接设置参数

In [16]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000, -5.9685,  8.4098,  1.0000])

### 定义共享参数层

In [17]:
# 我们需要给共享层一个名称，以便可以引用它的参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# 检查参数是否相同
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# 确保它们实际上是同一个对象，而不只是有相同的值
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## 4. 自定义层

类似块的定义

### 不带参数的自定义层

In [18]:
import torch
import torch.nn.functional as F
from torch import nn


class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

In [19]:
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

可以将层作为组件合并到更复杂的模型中

In [20]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

### 带参数的层

在层中定义参数（例如，权重和偏置）的一般方式是使用`nn.Parameter`类。下面是一个例子：

In [21]:
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

访问自定义层的参数：

In [22]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.0332,  0.0591,  1.2076],
        [ 0.8346, -1.1658, -2.1580],
        [-1.1543,  1.1331, -1.0558],
        [ 0.6031, -0.0344,  0.8949],
        [ 0.8334, -0.7193, -0.6285]], requires_grad=True)

用自定义层执行前向传播：

In [23]:
linear(torch.rand(2, 5))

tensor([[1.2113, 1.5755, 0.0000],
        [1.7097, 0.7933, 0.0000]])

## 5. 读写文件

### 张量读写

张量的读写操作是通过`torch.save()`和`torch.load()`函数实现的。

In [24]:
import torch
from torch import nn
from torch.nn import functional as F

x = torch.arange(4)
torch.save(x, './save/x-file')

x2 = torch.load('./save/x-file')
print(x2)

tensor([0, 1, 2, 3])


  x2 = torch.load('./save/x-file')


可以读写一个张量的数组：

In [25]:
y = torch.zeros(4)
torch.save([x,y], './save/x-y-file')

x2, y2 = torch.load('./save/x-y-file')
print(x2, y2)

tensor([0, 1, 2, 3]) tensor([0., 0., 0., 0.])


  x2, y2 = torch.load('./save/x-y-file')


可以读写映射到张量的字典：

In [26]:
mydict = {'x': x, 'y': y}
torch.save(mydict, './save/mydict')
mydict2 = torch.load('./save/mydict')
mydict2

  mydict2 = torch.load('./save/mydict')


{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

### 加载和保存模型参数

先自定义一个模型并实例化, 先计算一版对某个输入的前向传播结果：

In [27]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

将模型的参数存储在一个叫做“mlp.params”的文件中:

In [28]:
torch.save(net.state_dict(), './save/mlp.params')

加载模型参数:

In [29]:
clone = MLP()
clone.load_state_dict(torch.load('./save/mlp.params'))
clone.eval()

  clone.load_state_dict(torch.load('./save/mlp.params'))


MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

比较原模型和加载参数后的模型的结果，应当是一样的

In [30]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

## 6. GPU

查看设备的GPU信息：

In [31]:
!nvidia-smi

Sat Feb  1 17:03:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.07                 Driver Version: 566.07         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   56C    P8             11W /  100W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

通过pytorch获取计算设备:

In [32]:
import torch
from torch import nn

torch.device('cpu')

device(type='cpu')

In [33]:
torch.cuda.device('cuda')

<torch.cuda.device at 0x21cac7f41f0>

查看有多少个可用GPU:

In [34]:
torch.cuda.device_count() # 查看GPU数量

1

这两个函数允许我们在不存在所需所有GPU的情况下运行代码

In [35]:
def try_gpu(i=0):  #@save
    """如果存在，则返回gpu(i)，否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """返回所有可用的GPU，如果没有GPU，则返回[cpu(),]"""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

### 张量与GPU

默认情况下，张量创建在CPU上

In [36]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

在GPU上创建张量

In [37]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

张量运算必须在同一个设备上, 不在同一个设备上的张量运算需要先复制

In [42]:
Y = torch.ones(2,3)
# X+Y # 错误, 不能对不在同一个设备上的张量进行运算
Z = Y.cuda(0)
X+Z # 正确, 两个张量都在GPU上

tensor([[2., 2., 2.],
        [2., 2., 2.]], device='cuda:0')

### 神经网络与GPU

In [44]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

net(X)

tensor([[-0.7342],
        [-0.7342]], device='cuda:0', grad_fn=<AddmmBackward0>)

确认模型参数在同一个GPU上

In [45]:
net[0].weight.data.device

device(type='cuda', index=0)