In [9]:
import torch
from torch import nn
from torch.autograd import Variable
import torchvision.datasets as dsets
import torch.utils.data as Data
import matplotlib.pyplot as plt
import torchvision as tv

In [3]:
# 设置一个随机数,使我们可以每次均获得相同的随机数
torch.manual_seed(1)
print(torch.rand(1))

tensor([0.7576])


# 设置超参数

In [5]:
EPOCH = 1
BATCH_SIZE = 64
TIME_STEP = 28
INPUT_SIZE = 28
LR = 0.01
DOWNLOAD_MNIST = False

# Get the train set of the minist data

In [6]:
train_data = dsets.MNIST(
    root='./data/',
    train = True,
    transform = torchvision.transforms.ToTensor(),
    download = DOWNLOAD_MNIST,
)
train_data

Dataset MNIST
    Number of datapoints: 60000
    Root location: ./data/
    Split: Train
    StandardTransform
Transform: ToTensor()

In [7]:
# define the batch processing data of the train data
trainloader = torch.utils.data.DataLoader(
    train_data,
    batch_size = BATCH_SIZE,
    shuffle = True,
    )

# Get the test set of the minist data

In [10]:
# define test set
testset = tv.datasets.MNIST(
    root='./data/',
    train=False,
    download=DOWNLOAD_MNIST ,
    transform=torchvision.transforms.ToTensor())
testset[1][0].size()

torch.Size([1, 28, 28])

In [11]:
# define the batch processing dataset
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    )

In [12]:
# show the function of the dataloader, we can understand that  every batch return the inputs and the labels. 
# The size of inputs is [64, 1, 28, 28], that means we have 64 samples, every sample is a figure, and it is [1 , 28 , 28].
# In the meanwhile, we get the labels that are a 1 demension tensor including 64 elements.
for data in testloader:
    images , labels = data
    print("the images are: " , images.size())
    print("the labels are: " , labels.size())
    break

the images are:  torch.Size([64, 1, 28, 28])
the labels are:  torch.Size([64])


# define the LSTM Model

**下面cell中定义的这个LSTM的参数的一些解释：**
- input_size为28，意思是输入的特征维数是28；
- hidden_size为64，意思是输出的特征维数是64；
- num_layers为1，意思是隐藏层只有1层；
- batch_first为True,意思是输入输出的第0个维度是batch_size.
- input是一个三维的tensor,各个维度的意思分别是：

    输入数据格式：
    
    input(seq_len, batch, input_size)
    
    h0(num_layers * num_directions, batch, hidden_size)
    
    c0(num_layers * num_directions, batch, hidden_size)
    
- output是一个三维的tensor，各个维度的意思分别是：

    输出数据格式：
    
    output(seq_len, batch, hidden_size * num_directions)
    
    hn(num_layers * num_directions, batch, hidden_size)
    
    cn(num_layers * num_directions, batch, hidden_size)
    
*在我们这个例子中，input's size is $64*28*28$，也就是说一个batch有64个samples，序列长度是28，当然序列长度其实是可变的，不一定是固定的，这也是为什么模型中没有给出seq_len这个参数的原因，因为它是可变的。最后输入的特征维数是28，也就是说把一张图片的每一行像素作为一个序列的一个单元。output's size is $64*28*64$,也就是说一个batch有64个samples，序列长度是28，输出特征维数28.*
**之后在本模型中还加了一个仿射层。**仿射层前将每个output的size从$64*28*64$变成了$64*64$，然后经过放射变换后就变成了$64*10$,64就是一个batch有64个样本的意思，10代表的意思是0,1,2,3，···，9各个数字的概率，取结果为最大概率的那个数字就是最后的结果了。
我觉得取序列的任何一个理论上都可以的，取最后一个得到的准确率为95%，取序列的第一个准确率为11%，所以事实上只能取序列的最后一个，取序列的前面是会出现问题的。这是因为该模型实际上需要输入的是一个序列数据，在此我们便是把一张图片看成是一个序列，具体来说是每一行作为一个时刻，每一个时刻就是一行的28个像素，所以输入特征的维数是28，序列长度也是28，当然这是可以改变的，序列长度不一定要定长。所以自然应当是取序列中的最后一个向量作为放射层的输入。然后用双向的LSTM进行了实验，准确率有略微的上升。

In [96]:
# define the RNN model 
class RNN(nn.Module):
    def __init__(self):  # 定义一些需要的参数或者说是组件
        super(RNN , self).__init__()
 
        self.rnn = nn.LSTM(  # 使用nn.Module中自带的LSTM模型进行分析
            input_size = 28,  # 输入特征的维度是28
            hidden_size = 64,  # 输出特征的维度是64
            num_layers = 1,  # 隐藏层层数是1
            batch_first = True,  # 第一个参数是batch
        ) # rnn的参数里没有指定seq_len,因为seq_len是可变的。
 
        self.out = nn.Linear(64,10)
 
    def forward(self,x):  # 将init中定义的组件组合起来形成LSTM，并进行前向传播计算，反向传播会自动进行的不需要定义这个function.
        r_out, (h_n, h_c) = self.rnn(x, None)
#         print(1 , r_out.size())
#         print(2 , r_out[:,-1,:].size())
        out = self.out(r_out[:,-1,:])  # 只取序列中的最后一个进行分析。 
        return out

In [97]:
# help you understand the meaning of out = self.out(r_out[:,-1,:]) 
a = torch.rand((2 , 3 , 3))
print(a)
print(a[:,-1,:])
print(a[-1,:,:])

tensor([[[0.7629, 0.7097, 0.1461],
         [0.6418, 0.7792, 0.0941],
         [0.0923, 0.4499, 0.2842]],

        [[0.0705, 0.5491, 0.9103],
         [0.7107, 0.6511, 0.5376],
         [0.2022, 0.0587, 0.8673]]])
tensor([[0.0923, 0.4499, 0.2842],
        [0.2022, 0.0587, 0.8673]])
tensor([[0.0705, 0.5491, 0.9103],
        [0.7107, 0.6511, 0.5376],
        [0.2022, 0.0587, 0.8673]])


**see the defined LSTM model**

In [98]:
rnn = RNN()
print(rnn)

RNN(
  (rnn): LSTM(28, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=10, bias=True)
)


# Define the optimizer and loss function

In [99]:
optimizer = torch.optim.Adam(rnn.parameters(),lr=LR)
loss_func = nn.CrossEntropyLoss()

# train the LSTM model, and test it 

In [100]:
for epoch in range(EPOCH):
    rnn.train()  #  set the net as the train mode, which means the batch layer's parameters are stable and drop out is unworking!  
    for step,(x,y) in enumerate(trainloader):
        b_x = Variable(x.view(-1,28,28))
#         print(b_x.size())
        b_y = Variable(y)
 
        optimizer.zero_grad()  # Gradient clearing
        output = rnn(b_x) # 前向传播
        loss = loss_func(output,b_y)  # 计算loss function
        
        loss.backward() # backward propagation, prepare to update the parameters
        optimizer.step()  # update the parameters of the model.
 
    rnn.eval()  # set the model as the test mode.
    # if one epoch has been over, then test the accuaccy, and print it.
    with torch.no_grad(): # do not need calculate the gradient
        correct = 0.0
        total = 0.0
        for data in testloader:
            images , labels = data
            images = Variable(images.view(-1,28,28))
#             images , labels = images.to(device) , labels.to(device)
            outputs = rnn(images)
            # set the input as the label that get the highest score
            _ , predicted = torch.max(outputs.data , 1)  # return the max value of one row and the index of the max value
            total = total + labels.size(0)
            correct += (predicted == labels).sum()
        print("The accurancy of {}th epoch is : {:.3f}%".format(epoch + 1 , correct / total * 100))

        '''
        if step % 50 == 0:
            test_output = rnn(test_x.view(-1,28,28))
            pred_y = torch.max(test_output,1)[1].data.numpy().squeeze()
            accuracy = sum(pred_y == test_y)/float(test_y.size(0))
            print('Epoch: ',epoch, '| train loss:%.4f' %loss.data[0],'| test accuracy:%.2f' %accuracy)
        '''

The accurancy of 1th epoch is : 95.210%


# Understanding the LSTM more, know more details.

In [34]:
for data in testloader:
    images , labels = data
    images = Variable(images.view(-1,28,28))
    break
print(images.size())
print(labels.size())
output = rnn(images)
print(output.size())
print(output)
labels

torch.Size([64, 28, 28])
torch.Size([64])
torch.Size([64, 10])
tensor([[-3.3535e+00,  1.1583e+00, -6.6715e-01,  1.3278e+00, -6.2588e-01,
         -1.9954e+00, -5.5185e+00,  8.7044e+00, -3.7987e+00,  2.1977e+00],
        [-3.6073e+00,  8.6772e-01,  1.0681e+01, -1.1031e+00, -2.0871e+00,
         -4.5243e-01,  5.9701e-02,  7.8678e-01,  2.5038e-01, -4.5078e+00],
        [-3.7639e+00,  1.1737e+01, -3.7288e-01, -1.7628e+00,  7.2247e-03,
         -3.6377e-01, -2.3686e+00, -1.0372e+00, -1.1450e+00, -2.9703e+00],
        [ 5.6501e+00, -2.9123e+00, -7.8421e-01, -2.1026e+00, -1.7124e+00,
         -2.7104e-02,  1.9857e+00, -3.8162e+00,  3.7686e-01,  4.9845e-01],
        [-3.0889e+00, -2.2300e+00, -1.2888e+00, -4.0178e+00,  9.7358e+00,
         -2.7764e+00, -9.8048e-01, -2.1520e-01, -1.7955e+00,  9.1425e-01],
        [-3.5080e+00,  1.1271e+01, -1.2221e+00, -1.5767e+00,  8.5013e-02,
         -5.6014e-01, -3.3541e+00, -1.1469e-01, -1.6149e+00, -2.1192e+00],
        [-2.4550e+00, -1.9412e+00, -9.8498e

tensor([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6, 6, 5,
        4, 0, 7, 4, 0, 1, 3, 1, 3, 4, 7, 2, 7, 1, 2, 1, 1, 7, 4, 2, 3, 5, 1, 2,
        4, 4, 6, 3, 5, 5, 6, 0, 4, 1, 9, 5, 7, 8, 9, 3])

# Try the Bidirectional RNN

试一试双向RNN做手写数字识别的效果，看会不会准确率进一步上升。

In [101]:
# define the RNN model 
class BiRNN(nn.Module):
    def __init__(self):  # 定义一些需要的参数或者说是组件
        super(BiRNN , self).__init__()
 
        self.birnn = nn.LSTM(  # 使用nn.Module中自带的LSTM模型进行分析
            input_size = 28,  # 输入特征的维度是28
            hidden_size = 64,  # 输出特征的维度是64
            num_layers = 1,  # 隐藏层层数是1
            batch_first = True,  # 第一个参数是batch
            bidirectional=True  # 双向RNN
        ) # rnn的参数里没有指定seq_len,因为seq_len是可变的。
 
        self.out = nn.Linear(128,10)
 
    def forward(self,x):  # 将init中定义的组件组合起来形成LSTM，并进行前向传播计算，反向传播会自动进行的不需要定义这个function.
        r_out, (h_n, h_c) = self.birnn(x, None)
#         print(1 , r_out.size())
#         print(2 , r_out[:,-1,:].size())
        out = self.out(r_out[:,-1,:])  # 只取序列中的最后一个进行分析。 
        return out

In [102]:
birnn = BiRNN()
print(birnn)

BiRNN(
  (birnn): LSTM(28, 64, batch_first=True, bidirectional=True)
  (out): Linear(in_features=128, out_features=10, bias=True)
)


In [103]:
optimizer = torch.optim.Adam(birnn.parameters(),lr=LR)
loss_func = nn.CrossEntropyLoss()

In [105]:
for epoch in range(EPOCH):
    birnn.train()  #  set the net as the train mode, which means the batch layer's parameters are stable and drop out is unworking!  
    for step,(x,y) in enumerate(trainloader):
        b_x = Variable(x.view(-1,28,28))
#         print(b_x.size())
        b_y = Variable(y)
 
        optimizer.zero_grad()  # Gradient clearing
        output = birnn(b_x) # 前向传播
        loss = loss_func(output,b_y)  # 计算loss function
        
        loss.backward() # backward propagation, prepare to update the parameters
        optimizer.step()  # update the parameters of the model.
 
    birnn.eval()  # set the model as the test mode.
    # if one epoch has been over, then test the accuaccy, and print it.
    with torch.no_grad(): # do not need calculate the gradient
        correct = 0.0
        total = 0.0
        for data in testloader:
            images , labels = data
            images = Variable(images.view(-1,28,28))
#             images , labels = images.to(device) , labels.to(device)
            outputs = birnn(images)
            # set the input as the label that get the highest score
            _ , predicted = torch.max(outputs.data , 1)  # return the max value of one row and the index of the max value
            total = total + labels.size(0)
            correct += (predicted == labels).sum()
        print("The accurancy of {}th epoch is : {:.3f}%".format(epoch + 1 , correct / total * 100))

        '''
        if step % 50 == 0:
            test_output = rnn(test_x.view(-1,28,28))
            pred_y = torch.max(test_output,1)[1].data.numpy().squeeze()
            accuracy = sum(pred_y == test_y)/float(test_y.size(0))
            print('Epoch: ',epoch, '| train loss:%.4f' %loss.data[0],'| test accuracy:%.2f' %accuracy)
        '''

The accurancy of 1th epoch is : 97.060%


使用双向LSTM确实使得准确率有进一步的提升，基本上是从95%提升到了96%，这是因为双向的RNN不仅可以考虑到过去时刻的信息，还可以考虑到之后时刻的信息。不过在图像识别中这种提升不明显。