# 05-2、模型保存
当我们的模型训练好以后，需要对模型进行保存，在pytorch中有两种方式对模型进行保存，这一节我们将介绍如何对模型进行保存、加载模型进行预测以及加载模型继续训练。

## 1.保存、加载整个模型
保存了网络的结构和参数，容量比较大，占地方。   

In [21]:
import torch
from torchvision import datasets
from torch import nn
import torchvision
from torchvision.transforms import transforms
import torch.optim as optim
import torch.nn.functional as F

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
     

])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

trainset = datasets.CIFAR10(root='./data', train=True,
                          download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16,
                                          shuffle=True)


testset = datasets.CIFAR10(root='./data', train=False,
                         download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=16,
                                         shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [18]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Sequential(     #input_size=(1*28*28)
            nn.Conv2d(3, 6, 5, 1, 2), #padding=2保证输入输出尺寸相同
            nn.ReLU(),      #input_size=(6*28*28)
            nn.MaxPool2d(kernel_size=2, stride=2),#output_size=(6*14*14)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(6, 16, 5),
            nn.ReLU(),      #input_size=(16*10*10)
            nn.MaxPool2d(2, 2)  #output_size=(16*5*5)
        )
        self.fc1 = nn.Sequential(
            nn.Linear(576, 120),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.ReLU()
        )
        self.fc3 = nn.Linear(84, 10)

    # 定义前向传播过程，输入为x
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # nn.Linear()的输入输出都是维度为一的值，所以要把多维度的tensor展平成一维
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [19]:
# 使用AlexNet进行训练
# AlexNet默认的参数太大了如果你不想等的时间太长可以将下面这句解除注释，把model = torchvision.models.alexnet(num_classes=10)注释掉
# model = alexnet(num_classes=10)
model = LeNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
epochs = 1
model

LeNet(
  (conv1): Sequential(
    (0): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=576, out_features=120, bias=True)
    (1): ReLU()
  )
  (fc2): Sequential(
    (0): Linear(in_features=120, out_features=84, bias=True)
    (1): ReLU()
  )
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [24]:
# 在训练过程中一般一个轮次训练结束后对测试集整体进行测试，获取测试集上的损失和准确率
losses = []
acces = []
eval_losses = []
eval_acces = []
for e in range(epochs):
    train_loss = 0
    train_acc = 0
    model.train() 
    for data, target in trainloader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # loss.item()是平均损失，平均损失*batch_size=一次训练的损失
        train_loss += loss.item() * data.size(0)  
        _, pred = output.max(1)
        # 计算一个批次对了几个
        num_correct = (pred == target).sum().item()
        # 计算准确率=对的个数/批次大小
        acc = num_correct / data.shape[0]
        train_acc += acc
    # 统计一个轮次中平均损失与平均准确率
#     losses.append(train_loss / len(trainloader.dataset))
    acces.append(train_acc / len(trainloader))
    eval_loss = 0
    eval_acc = 0
    model.eval() # 将模型改为预测模式
    for im, label in testloader:
        out = model(im)
        loss = criterion(out, label)
        # 记录误差
        eval_loss += loss.item()
        # 记录准确率
        _, pred = out.max(1)
        num_correct = (pred == label).sum().item()
        acc = num_correct / im.shape[0]
        eval_acc += acc
    # 上面的代码对全部的测试集进行测试，下面两行就是讲这一次的整个测试集的损失和准确率存在列表里,如果进行可视化可能会用到
#     eval_losses.append(eval_loss / len(testloader.dataset))
    eval_acces.append(eval_acc / len(testloader))
    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}, Eval Loss: {:.6f}, Eval Acc: {:.6f}'
          .format(e, train_loss / len(trainloader), train_acc / len(trainloader), 
                     eval_loss / len(testloader), eval_acc / len(testloader)))

epoch: 0, Train Loss: 22.226948, Train Acc: 0.498400, Eval Loss: 1.278070, Eval Acc: 0.543500


In [25]:
# 模型保存
torch.save(model, './models/LeNet.pth')

In [27]:
# 模型加载
# 你可以只加载模型不需要网络的代码
model_ = torch.load( './models/LeNet.pth')
# 转成测试模式进行计算，如果想继续训练转换成train模式即可
model_.eval()
test_data = torch.rand((1, 3, 32, 32))
model(test_data)

tensor([[-0.0179, -1.4930,  1.2258,  0.4460,  1.0655, -0.0189,  0.9023, -1.0236,
         -0.1456, -1.2768]], grad_fn=<AddmmBackward>)

如果只想保存模型参数，有一个概念很重要，就是模型的状态字典：model.state_dict()，在PyTorch中，torch.nn.Module模型中可学习的参数（即重量和偏置）都包含在模型参数中（使用model.parameters()访问）。state_dict 是个将每层参数映射到对应的参数张量的python字典对象(OrderedDict)。注意:state_dict的条目仅包括带有可学习参数的层(卷积层，线性层等)和registered buffers(BN层的mean等)。优化器对象（torch.optim）也有state_dict ，它包含有关该优化器状态信息，以及所使用的超参数。由于state_dict对象是OrderedDict，它们可以方便地保存，更新，修改和恢复，方便PyTorch模型和优化器添加了大量的模块化。

参考：https://blog.csdn.net/sinat_24899403/article/details/102806957

In [29]:
# 只保存参数
torch.save(model.state_dict(), './models/LeNet_dict.pth')

In [31]:
# 如果加载只有参数的文佳 首先你要有网络的代码
PATH =  './models/LeNet_dict.pth'
model_dict = LeNet() 
print(torch.load(PATH).keys())
# 可以看到输出了一些权重值
model_dict.load_state_dict(torch.load(PATH))
model_dict.eval()
model_dict

odict_keys(['conv1.0.weight', 'conv1.0.bias', 'conv2.0.weight', 'conv2.0.bias', 'fc1.0.weight', 'fc1.0.bias', 'fc2.0.weight', 'fc2.0.bias', 'fc3.weight', 'fc3.bias'])


LeNet(
  (conv1): Sequential(
    (0): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=576, out_features=120, bias=True)
    (1): ReLU()
  )
  (fc2): Sequential(
    (0): Linear(in_features=120, out_features=84, bias=True)
    (1): ReLU()
  )
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

注意一下在使用GPU和CPU混合训练或者跨设备的时候，注意转换细节，看看报错信息，比如使用CPU加载GPU模型文件的时候torch.load函数要添加map_location='cpu'参数。还有更复杂的问题到时候百度就完事了。