In [1]:
import numpy as np
import torch
import torchvision

import torchvision
import torchvision.transforms as transforms

## 获取和读取数据
这里继续使用Fashion-MNIST数据集。我们将使用多层感知机对图像进行分类。

In [2]:
# Mini-batch size shared by the training and test loaders.
batch_size = 256

# Fashion-MNIST train / test splits. download=True lets torchvision fetch the
# files into `root`; ToTensor() turns every image into a tensor.
mnist_train = torchvision.datasets.FashionMNIST(
    root='~/Datasets/FashionMNIST',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
mnist_test = torchvision.datasets.FashionMNIST(
    root='~/Datasets/FashionMNIST',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_iter = torch.utils.data.DataLoader(
    mnist_train, batch_size=batch_size, shuffle=True
)
test_iter = torch.utils.data.DataLoader(
    mnist_test, batch_size=batch_size, shuffle=False
)

In [3]:
# Number of examples in the training split.
print(len(mnist_train))

60000


In [8]:
# Unpack the first training sample into its image tensor and class label,
# then display the tensor's size next to the label.
features,labels = mnist_train[0]
features.size(),labels

(torch.Size([1, 28, 28]), 9)

## 定义模型参数
Fashion-MNIST数据集中图像形状为 $28 \times 28$，类别数为10。本节中我们依然使用长度为 $28 \times 28 = 784$ 的向量表示每一张图像。因此，输入个数为784，输出个数为10。实验中，我们设超参数隐藏单元个数为256。

In [9]:
# Layer widths: flattened 28x28 image -> hidden layer -> one score per class.
num_inputs, num_outputs, num_hiddens = 784, 10, 256

# Weights are sampled with NumPy from a normal distribution (mean 0, std 0.01)
# and converted to float tensors; biases start at zero.
W1 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_inputs, num_hiddens)),
    dtype=torch.float,
)
b1 = torch.zeros(num_hiddens, dtype=torch.float)
W2 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_hiddens, num_outputs)),
    dtype=torch.float,
)
b2 = torch.zeros(num_outputs, dtype=torch.float)

# Collect the parameters and switch on gradient tracking for each of them.
params = [W1, b1, W2, b2]
for param in params:
    param.requires_grad_(True)

## 定义激活函数
这里我们使用基础的`max`函数来实现ReLU，而非直接调用`relu`函数。

In [11]:
def relu(X):
    """Apply ReLU element-wise using ``torch.max`` instead of a built-in relu.

    Args:
        X: input tensor.

    Returns:
        The element-wise maximum of ``X`` and a scalar zero tensor.
    """
    zero = torch.tensor(0.0)
    return torch.max(X, zero)

## 定义模型
同softmax回归一样，我们通过`view`函数将每张原始图像改成长度为`num_inputs`的向量。然后我们实现上一节中多层感知机的计算表达式。

In [17]:
def net(X):
    """Forward pass of the one-hidden-layer perceptron.

    Args:
        X: batch of inputs; it is reshaped to rows of ``num_inputs`` values.

    Returns:
        Output-layer scores ``relu(X W1 + b1) W2 + b2``, one row per sample.
    """
    flat = X.view(-1, num_inputs)
    hidden = relu(flat @ W1 + b1)
    return hidden @ W2 + b2

## 定义损失函数
为了得到更好的数值稳定性，我们直接使用PyTorch提供的包括softmax运算和交叉熵损失计算的函数。

In [13]:
# Criterion that bundles the softmax and cross-entropy computations, so it is
# fed the network's raw output scores.
loss = torch.nn.CrossEntropyLoss()

In [14]:
def accuracy(y_hat,y):
    """Compute the classification accuracy of one batch of predictions.

    Args:
        y_hat: tensor of per-class scores with one row per sample; the
            predicted class is the arg-max along dimension 1.
        y: tensor of true class indices, one per row of ``y_hat``.

    Returns:
        The fraction of rows whose predicted class equals the label, as a
        Python float.
    """
    return (y_hat.argmax(dim=1) == y).float().mean().item()

def evaluate_accuracy(data_iter,net):
    """Evaluate the accuracy of ``net`` over every batch of ``data_iter``.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches, where ``y`` holds the
            true class indices for the rows of ``X``.
        net: callable mapping a batch ``X`` to per-class scores, one row per
            sample.

    Returns:
        Number of correctly classified samples divided by the total number of
        samples seen, as a Python float.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    # Evaluation never calls backward(), so disable autograd to avoid building
    # a computation graph for every test batch.
    with torch.no_grad():
        for X, y in data_iter:
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
def sgd(params, lr, batch_size):
    """Apply one in-place mini-batch SGD step to every parameter.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.

    Args:
        params: iterable of tensors to update.
        lr: learning rate.
        batch_size: divisor applied to each gradient before the step.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # Nothing was back-propagated into this parameter; leave it
                # untouched instead of failing on ``lr * None``.
                continue
            param -= lr * param.grad / batch_size

## 训练模型
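我们在这里设超参数迭代周期数为5，学习率为100.0。学习率看起来很大，是因为`sgd`在更新参数时还会把梯度除以`batch_size`，而PyTorch的`CrossEntropyLoss`默认已经对批量内的损失取了平均，所以实际的更新步长约为`lr / batch_size`。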

In [19]:
# Number of training epochs and the step size for the manual SGD run; note
# that sgd() divides every gradient by batch_size before applying lr.
num_epochs, lr = 5, 100

def train_(net,train_iter,test_iter,loss,num_epochs,batch_size,
                  params = None, lr = None, optimizer = None):
    """Train ``net`` and print loss / accuracy statistics after every epoch.

    When ``optimizer`` is given it performs the gradient reset and the update
    step; otherwise the gradients of ``params`` are zeroed manually and the
    update is delegated to ``sgd(params, lr, batch_size)``.

    Args:
        net: callable mapping a batch ``X`` to per-class scores.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable passed to ``evaluate_accuracy`` after each epoch.
        loss: callable ``loss(y_hat, y)``; its result is summed before
            ``backward()`` is called on it.
        num_epochs: number of passes over ``train_iter``.
        batch_size: forwarded to ``sgd`` on the manual-update path.
        params: parameters updated by ``sgd`` when no optimizer is supplied.
        lr: learning rate forwarded to ``sgd``.
        optimizer: optional object exposing ``zero_grad()`` and ``step()``.

    The printed loss is the sum of the per-batch loss values divided by the
    number of training samples seen in the epoch.
    """
    manual_update = optimizer is None
    for epoch_no in range(1, num_epochs + 1):
        loss_total, correct_total, seen = 0.0, 0.0, 0
        for X, y in train_iter:
            # y_hat holds one row of class scores per sample in the batch.
            y_hat = net(X)
            batch_loss = loss(y_hat, y).sum()

            # Clear gradients left over from the previous step.
            if not manual_update:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for p in params:
                    p.grad.data.zero_()

            batch_loss.backward()
            if manual_update:
                sgd(params, lr, batch_size)
            else:
                optimizer.step()

            loss_total += batch_loss.item()
            correct_total += (y_hat.argmax(dim=1) == y).sum().item()
            seen += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print("epoch %d,loss %.4f,train_acc %.3f,test acc %.3f"
              % (epoch_no, loss_total / seen, correct_total / seen, test_acc))

# Run training with the hand-written SGD update (no optimizer is passed).
train_(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    params=params, lr=lr,
)

epoch 1,loss 0.0030,train_acc 0.712,test acc 0.741
epoch 2,loss 0.0019,train_acc 0.823,test acc 0.793
epoch 3,loss 0.0017,train_acc 0.844,test acc 0.770
epoch 4,loss 0.0016,train_acc 0.855,test acc 0.821
epoch 5,loss 0.0014,train_acc 0.865,test acc 0.834
