# 模型训练流程 笔记
教程视频链接：https://www.bilibili.com/video/BV1hE411t7RN

这篇笔记对应视频合集中的
- 完整的模型训练套路（一）
- 完整的模型训练套路（二）
- 完整的模型训练套路（三）

本节内容是前面几节内容的汇总，总结了从创建数据集到构建神经网络再到训练模型参数的过程，这篇笔记中的代码大部分与前面的相同。

## 1.准备数据集


In [1]:
import torchvision

# Load the CIFAR10 train and test splits from ../dataset; every image is
# converted to a tensor. download=False, so nothing is fetched here.
to_tensor = torchvision.transforms.ToTensor()
train_data = torchvision.datasets.CIFAR10(
    root='../dataset', train=True, transform=to_tensor, download=False
)
test_data = torchvision.datasets.CIFAR10(
    root='../dataset', train=False, transform=to_tensor, download=False
)

# Number of samples in each split.
train_data_size = len(train_data)  # 50000
test_data_size = len(test_data)  # 10000
print(train_data_size, test_data_size)

50000 10000


## 2.用dataloader加载数据集

In [2]:
import torch

# Wrap both splits in DataLoaders that yield mini-batches of 64 samples.
batch_size = 64
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

## 3.搭建神经网络

In [3]:
# This class definition could also live in a separate file.
class Tudui(torch.nn.Module):
    """Small convolutional classifier with 10 output values per sample.

    Three (5x5 convolution -> 2x2 max-pool) stages are followed by a flatten
    and a single linear layer. The linear layer expects 1024 features, which
    is 64 channels * 4 * 4, i.e. what a 3x32x32 input is reduced to.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Layers are created in execution order; their positions in the
        # Sequential container determine the state_dict key names.
        layers = [
            nn.Conv2d(3, 32, kernel_size=5, padding=2),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 32, kernel_size=5, padding=2),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=5, padding=2),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(1024, 10),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the sequential stack and return the result."""
        return self.model(x)

In [40]:
# Sanity-check the network: feed a dummy batch through it and inspect the
# shape of the output.
tudui = Tudui()
# Named dummy_input rather than `input` so the builtin input() is not
# shadowed for the rest of the session.
dummy_input = torch.ones((64, 3, 32, 32))
output = tudui(dummy_input)
output.shape

torch.Size([64, 10])

## 4.创建损失函数与优化器

In [41]:
# Loss function: cross-entropy between the network outputs and the targets.
loss_fn = torch.nn.CrossEntropyLoss()
# Optimizer: SGD over all parameters of the network.
# (1e-2 was noted in the original as an alternative learning rate.)
learning_rate = 1e-3
optimizer = torch.optim.SGD(tudui.parameters(), lr=learning_rate)

## 5.设置训练网络参数

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Number of training steps (mini-batches) processed so far.
total_train_step = 0
# Number of completed test passes; used as the x-axis for "test_loss".
total_test_step = 0
# Number of epochs to train for.
epoch = 10


writer = SummaryWriter('../logs')

# try/finally so the writer is closed even if training is interrupted.
try:
    for i in range(epoch):
        epoch_loss = 0.0
        for data in train_dataloader:
            inputs, targets = data
            outputs = tudui(inputs)
            optimizer.zero_grad()
            result_loss = loss_fn(outputs, targets)
            result_loss.backward()
            optimizer.step()
            # .item() extracts a plain Python float, so the running sum does
            # not keep a reference to each batch's autograd history.
            batch_loss = result_loss.item()
            epoch_loss += batch_loss
            total_train_step += 1
            # Log the training loss every 100 steps.
            if total_train_step % 100 == 0:
                writer.add_scalar("train_loss", batch_loss, total_train_step)
        print("第{0}轮，第{1}步，loss为{2}".format(i, total_train_step, epoch_loss))

        # Test pass: sum the loss over the whole test set without tracking
        # gradients.
        total_test_loss = 0.0
        with torch.no_grad():
            for data in test_dataloader:
                img, targets = data
                outputs = tudui(img)
                total_test_loss += loss_fn(outputs, targets).item()
        print("测试集上的loss为{0}".format(total_test_loss))
        writer.add_scalar('test_loss', total_test_loss, total_test_step)
        total_test_step += 1
finally:
    writer.close()

## 6.显示正确率
用tensor的argmax方法可以得到tensor沿某一维度最大值所在的索引（下标），而不是最大值本身；将预测的索引与标签逐个比较即可统计正确率

In [16]:
# Small demo of argmax on a 2x2 tensor.
output = torch.tensor([[0.1, 0.0], [0.3, 0.4]])
print(torch.argmax(output))  # no dim: index of the max in the flattened tensor
print(torch.argmax(output, dim=0))  # per column: row index of the max
print(torch.argmax(output, dim=1))  # per row: column index of the max

tensor(3)
tensor([1, 1])
tensor([0, 1])


In [None]:
# Counters: training steps processed, test passes completed, epochs to run.
total_train_step = 0
total_test_step = 0
epoch = 10

for i in range(epoch):
    # Training.
    # train() is called at the start of every epoch because eval() is called
    # below for the test pass; calling it only once before the loop would
    # leave the model in eval mode from the second epoch onwards.
    # (These mode switches only affect some layer types.)
    tudui.train()
    epoch_loss = 0.0
    epoch_train_step = 0
    epoch_correct_step = 0
    for data in train_dataloader:
        inputs, targets = data
        outputs = tudui(inputs)
        optimizer.zero_grad()
        result_loss = loss_fn(outputs, targets)

        # Count correct predictions for the whole batch at once: argmax(1)
        # gives the predicted class index for each sample.
        epoch_correct_step += (outputs.argmax(1) == targets).sum().item()
        epoch_train_step += len(targets)

        result_loss.backward()
        optimizer.step()
        # .item() yields a plain float so the running sum does not keep a
        # reference to each batch's autograd history.
        epoch_loss += result_loss.item()
        total_train_step += 1
    print("第{0}轮，第{1}步，loss为{2}，正确率为{3}".format(i, total_train_step, epoch_loss, epoch_correct_step / epoch_train_step))

    # Test pass: no gradient tracking, model switched to eval mode.
    tudui.eval()
    total_test_loss = 0.0
    epoch_test_step = 0
    epoch_test_correct = 0
    with torch.no_grad():
        for data in test_dataloader:
            img, targets = data
            outputs = tudui(img)
            total_test_loss += loss_fn(outputs, targets).item()

            epoch_test_correct += (outputs.argmax(1) == targets).sum().item()
            epoch_test_step += len(targets)
    print("测试集上的loss为{0}，正确率为{1}".format(total_test_loss, epoch_test_correct / epoch_test_step))
    total_test_step += 1

## 7.保存模型

In [44]:
# Save the whole module object (network structure together with its weights).
full_model_path = "tudui_{0}.pth".format(epoch)
torch.save(tudui, full_model_path)

# Save only the weights, i.e. the module's state dict.
# NOTE(review): unlike the file above, this file name has no ".pth" suffix.
dic = tudui.state_dict()
weights_path = "tudui_dic_{0}".format(epoch)
torch.save(dic, weights_path)
print("模型已保存")

模型已保存
