In [97]:
import torch
from torch import nn
from net import MyLeNet
# lr_scheduler: 提供根据epoch训练次数来调整学习率的方法
from torch.optim import lr_scheduler
import torchvision  
from torch.utils.data import  DataLoader
# transforms 主要用于一些常用的变换
import torchvision.transforms as transforms
import os

In [98]:
# Input pipeline: convert every sample to tensor format.
# transforms.Compose() chains several transforms into a single callable; the
# chain here holds only ToTensor(), which turns a numpy ndarray or PIL.Image
# into a (C, H, W) tensor normalised to the range (0, 1.0).
transform = transforms.Compose([transforms.ToTensor()])

# Training split, stored under ./data/train, served in shuffled batches of 16.
train_dataset = torchvision.datasets.MNIST(
    root='./data/train',
    train=True,
    download=True,
    transform=transform,
)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Test split, stored under ./data/test, also served in shuffled batches of 16.
test_dataset = torchvision.datasets.MNIST(
    root='./data/test',
    train=False,
    download=True,
    transform=transform,
)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [99]:
# Train on the GPU when an NVIDIA card (CUDA) is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network, then move it onto the selected device.
model = MyLeNet()
model = model.to(device)

In [100]:
# Loss function: cross-entropy loss.
loss_fn = nn.CrossEntropyLoss()

# Optimizer: stochastic gradient descent with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

# Learning-rate schedule: StepLR multiplies the learning rate by gamma (0.1)
# once every step_size (10) calls to lr_scheduler.step(); the learning rate
# only changes when step() is actually called.
#
# The class is looked up through the fully qualified torch.optim.lr_scheduler
# path on purpose. The result is stored under the name `lr_scheduler`, which is
# also the name the imports cell binds to the module. Writing
# `lr_scheduler.StepLR(...)` here would work the first time only and raise
# AttributeError on every re-run of this cell, because by then the name refers
# to the scheduler instance instead of the module.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [101]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and print the metrics.

    Args:
        dataloader: Iterable yielding ``(x, y)`` batches of inputs and labels.
        model: Network to optimise; ``model(x)`` must return one row of class
            scores per sample (predictions are taken with ``argmax(dim=1)``).
        loss_fn: Callable ``loss_fn(output, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        None. Prints the loss averaged over batches and the accuracy computed
        over all samples seen.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    # Select training mode explicitly: an earlier evaluation pass may have left
    # the model in eval mode, which would silently change the behaviour of
    # mode-dependent layers (dropout, batch norm) for the rest of training.
    model.train()

    # Batches are moved to the device that holds the model's parameters, so the
    # function does not rely on a notebook-level global. A model without
    # parameters leaves the batches where they already are.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    loss_sum = 0.0   # running sum of per-batch losses (becomes a tensor)
    correct = 0      # running count of correct predictions (becomes a tensor)
    seen = 0         # number of samples processed
    batches = 0      # number of batches processed

    for x, y in dataloader:
        if target is not None:
            x, y = x.to(target), y.to(target)

        # Forward pass and loss for this batch.
        output = model(x)
        cur_loss = loss_fn(output, y)

        # Backward pass: clear stale gradients, compute new ones, update.
        optimizer.zero_grad()
        cur_loss.backward()
        optimizer.step()

        # Accumulate a detached value so the running total does not keep the
        # autograd history of every batch alive.
        loss_sum = loss_sum + cur_loss.detach()

        # Index of the largest score in each row is the predicted class.
        pred = output.argmax(dim=1)
        correct = correct + (pred == y).sum()
        seen += output.shape[0]
        batches += 1

    if seen == 0:
        raise ValueError('dataloader yielded no samples')

    train_loss = loss_sum / batches
    # Divide by the sample count rather than averaging per-batch accuracies,
    # so a smaller final batch is not over-weighted.
    train_acc = correct / seen
    print('train_loss: ' + str(train_loss.item()))
    print('train_acc ' + str(train_acc.item()))



In [102]:
def val(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        dataloader: Iterable yielding ``(x, y)`` batches of inputs and labels.
        model: Network to evaluate; ``model(x)`` must return one row of class
            scores per sample (predictions are taken with ``argmax(dim=1)``).
        loss_fn: Callable ``loss_fn(output, y)`` returning a scalar loss tensor.

    Returns:
        A 0-dim tensor holding the fraction of correctly classified samples.
        Also prints the loss averaged over batches and that accuracy.

    Raises:
        ValueError: If ``dataloader`` yields no samples.

    Note:
        The model is left in eval mode when this function returns.
    """
    # Evaluation mode, so mode-dependent layers behave deterministically.
    model.eval()

    # Batches are moved to the device that holds the model's parameters, so the
    # function does not rely on a notebook-level global. A model without
    # parameters leaves the batches where they already are.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    loss_sum = 0.0   # running sum of per-batch losses (becomes a tensor)
    correct = 0      # running count of correct predictions (becomes a tensor)
    seen = 0         # number of samples processed
    batches = 0      # number of batches processed

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            if target is not None:
                x, y = x.to(target), y.to(target)
            output = model(x)
            loss_sum = loss_sum + loss_fn(output, y)
            # Index of the largest score in each row is the predicted class.
            correct = correct + (output.argmax(dim=1) == y).sum()
            seen += output.shape[0]
            batches += 1

    if seen == 0:
        raise ValueError('dataloader yielded no samples')

    print('val_loss: ' + str(loss_sum.item() / batches))
    # Divide by the sample count rather than averaging per-batch accuracies,
    # so a smaller final batch is not over-weighted.
    print('val_acc: ' + str(correct.item() / seen))
    # Return the model accuracy as a tensor, as callers compare it directly.
    return correct / seen


In [105]:
# Training driver: one training pass followed by one evaluation pass per epoch.
epoch = 20
max_acc = 0  # best accuracy seen so far; decides when a checkpoint is written
for t in range(epoch):
    print(f'epoch{t+1}\n----------')
    # Train the model for one pass over the training data.
    train(train_dataloader, model, loss_fn, optimizer)
    # Evaluate the model. NOTE(review): the test split is used for model
    # selection here; confirm that is intended.
    acc = val(test_dataloader, model, loss_fn)
    # Checkpoint only when the accuracy improves, so that best_model.pth
    # really holds the best weights instead of the most recent ones.
    if acc > max_acc:
        folder = 'save_model'
        # Create the target directory if needed; no error if it already exists.
        os.makedirs(folder, exist_ok=True)
        max_acc = acc
        print('save best model')
        # model.state_dict() is the mapping of parameter names to tensors;
        # torch.save writes it to the given path.
        torch.save(model.state_dict(), 'save_model/best_model.pth')
    # Advance the learning-rate schedule once per epoch, after the optimizer
    # updates of this epoch; without this call the learning rate never decays.
    lr_scheduler.step()
print('Done!')


epoch1
----------
train_loss: 0.05663735792040825
train_acc 0.9827333688735962
val_loss: 0.04972222900390625
val_acc: 0.9843
save best model
epoch2
----------
train_loss: 0.05509629845619202
train_acc 0.9827499985694885
val_loss: 0.04799354858398437
val_acc: 0.9843
epoch3
----------
train_loss: 0.05402019992470741
train_acc 0.9833999872207642
val_loss: 0.05158168334960937
val_acc: 0.9836
epoch4
----------
train_loss: 0.05272310599684715
train_acc 0.9837666749954224
val_loss: 0.04974171447753906
val_acc: 0.9843
epoch5
----------
train_loss: 0.051260050386190414
train_acc 0.9842666983604431
val_loss: 0.04655463256835937
val_acc: 0.9848
save best model
epoch6
----------
train_loss: 0.05036832019686699
train_acc 0.9843000173568726
val_loss: 0.044907351684570315
val_acc: 0.9848
epoch7
----------
train_loss: 0.049182452261447906
train_acc 0.9852833151817322
val_loss: 0.04811090087890625
val_acc: 0.9843
epoch8
----------
train_loss: 0.04841326177120209
train_acc 0.9853000044822693
val_loss: 0