In [1]:
# trainCIFAR.py
from asyncore import write
import torchvision
from torch.utils.data import DataLoader
import torch
from torch import nn
import sys
import os
curPath = os.path.abspath(os.path.dirname("modelCIFAR.py"))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
from modelCIFAR import Net
from torch.utils.tensorboard import SummaryWriter  # 关于tensorboard的使用

# 训练数据集合测试数据集的准备
train_data = torchvision.datasets.CIFAR10(root = "../data", train=True, 
transform=torchvision.transforms.ToTensor(), download=True) # 注意要将PIL图像转换成向量

test_data = torchvision.datasets.CIFAR10(root="../data", train = False, 
transform=torchvision.transforms.ToTensor(), download=True)

# 查看训练数据集合测试数据集的长度
train_data_size = len(train_data)
test_data_size = len(test_data)

# format字符串格式化，format中的变量替换{}
print("训练数据集的长度为:{}".format(train_data_size))
print("训练数据集的长度为:{}".format(test_data_size))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:28<00:00, 5885766.06it/s]


Extracting ../data/cifar-10-python.tar.gz to ../data
Files already downloaded and verified
训练数据集的长度为:50000
训练数据集的长度为:10000


In [2]:
# 利用Dataloader来加载数据集
train_dataloader = DataLoader(train_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

In [4]:
# 搭建神经网络(10分类)
net = Net()

# 创建交叉熵损失函数
loss_fn = nn.CrossEntropyLoss()

# 优化器 设置优化的参数和学习速率
learning_rate = 1e-2   # 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)


# 设置训练网络的参数
# 记录训练的次数
total_train_step = 0
# 记录测试的次数
total_test_step = 0
# 训练的轮数
epoch = 10

In [5]:
# 添加tensorboard
writer = SummaryWriter("../logs_train")

for i in range(epoch):
    print("-----第{}轮训练开始-----".format(i+1)) # i从0 - 9 

    # 训练步骤开始
    for data in train_dataloader:
        imgs, targets = data
        outputs = net(imgs)
        loss = loss_fn(outputs, targets)

        # 优化器优化模型
        optimizer.zero_grad() # 用优化器清零梯度
        loss.backward()  # 得到每个参数节点的梯度
        optimizer.step()

        total_train_step = total_train_step+1  # 训练完一次

        if total_train_step % 100 == 0:
            print("训练次数:{}, loss: {}".format(total_train_step, loss.item())) # item将tensor数据类型转换成数字
            writer.add_scalar("train_loss", loss.item(), total_train_step)

    # 每次训练完一轮以后，在测试数据集上跑一遍，用测试数据集上的损失或者叫正确率来评估有没有训练好
    total_test_loss = 0
    total_accuracy = 0
    with torch.no_grad(): #没有梯度。不会调优 
        for data in test_dataloader:  # data的一部分数据在网络模型上的损失
            imgs, targets = data
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            total_test_loss += loss.item()  # loss是一个张量转换成item标量
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy += accuracy

    print("整体测试集上的loss: {}".format(total_test_loss))  
    print("整体测试集上的正确率: {}".format(total_accuracy / test_data_size))

    writer.add_scalar("test_loss", total_test_loss, total_test_step)
    writer.add_scalar("test_accuracy", total_accuracy/test_data_size, total_test_step)
    total_test_step += 1  # 测试完一次

    # torch.save(net, "Net_{}.pth".format(i))
    # 官方推荐的保存模型
    torch.save(net.state_dict(), "Net_{}.pth".format(i))
    print("模型已保存")


writer.close()

-----第1轮训练开始-----
训练次数:100, loss: 2.2908287048339844
训练次数:200, loss: 2.283921003341675
训练次数:300, loss: 2.274627685546875
训练次数:400, loss: 2.231511116027832
训练次数:500, loss: 2.0906808376312256
训练次数:600, loss: 2.0477702617645264
训练次数:700, loss: 2.013421058654785
整体测试集上的loss: 316.7603305578232
整体测试集上的正确率: 0.2694999873638153
模型已保存
-----第2轮训练开始-----




训练次数:800, loss: 1.910315990447998
训练次数:900, loss: 1.8641213178634644
训练次数:1000, loss: 1.9227256774902344
训练次数:1100, loss: 1.9627751111984253
训练次数:1200, loss: 1.7137823104858398
训练次数:1300, loss: 1.6793209314346313
训练次数:1400, loss: 1.7559492588043213
训练次数:1500, loss: 1.8092200756072998
整体测试集上的loss: 303.2302030324936
整体测试集上的正确率: 0.31189998984336853
模型已保存
-----第3轮训练开始-----
训练次数:1600, loss: 1.74198317527771
训练次数:1700, loss: 1.6731727123260498
训练次数:1800, loss: 1.9179251194000244
训练次数:1900, loss: 1.7241748571395874
训练次数:2000, loss: 1.894544005393982
训练次数:2100, loss: 1.525009274482727
训练次数:2200, loss: 1.4688990116119385
训练次数:2300, loss: 1.7759449481964111
整体测试集上的loss: 272.5285631418228
整体测试集上的正确率: 0.3725999891757965
模型已保存
-----第4轮训练开始-----
训练次数:2400, loss: 1.7314164638519287
训练次数:2500, loss: 1.3273792266845703
训练次数:2600, loss: 1.59180748462677
训练次数:2700, loss: 1.6896920204162598
训练次数:2800, loss: 1.4609264135360718
训练次数:2900, loss: 1.5987234115600586
训练次数:3000, loss: 1.323833703994751
训练次数:3100


假设是一个二分类问题
输入是两张图片
2 x input
输出
[0.1, 0.2]
[0.3, 0.4]
预测
pred =  [1]
        [1]
通过argmax将输出变成 pred

当合真实的y值比较 Inputs target = [0, 1]
通过Preds == inputs target 判断对应位置是否相等
[false, true].sum() = 1

之所以没有在训练的时候同model.train()
在测试的时候用model.eval()
是因为这两个只对网络中特定的层起作用如dropout层、batchnorm层
但是net并没有
