# Homework 7 - Experiments on RNN and LSTM

Please implement the following two functions:
- MnistRNN() - Design a RNN
- MnistLSTM() - Design a LSTM 

Please train two models on the Mnist dataset and print the training results for each epoch.

In [1]:
from torchvision.datasets import MNIST
from torchvision.transforms import Compose,ToTensor,Normalize
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import os
import torch
import numpy as np

BATCH_SIZE = 128
TEST_BATCH_SIZE = 1000
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cudnn.enabled = False

# dataloader for the dataset
def get_dataloader(train,batch_size=BATCH_SIZE):
    transform_fn = Compose([
        ToTensor(),
        Normalize(mean = (0.1307,), std = (0.3081,))
        ]) 
    dataset = MNIST(root = './data',train = train,transform = transform_fn, download = True)
    data_loader = DataLoader(dataset,batch_size = batch_size,shuffle = True)
    return data_loader

In [2]:
# RNN
class MnistRNN(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=128, output_dim=10,
                 num_layers=1):
        super(MnistRNN, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.stem = nn.Sequential(
            nn.Conv2d(
                input_dim, hidden_dim,
                kernel_size=4, stride=4),
            nn.ReLU())

        # Tip: define RNN
        # pass
        self.flattened_size = hidden_dim * 7 * 7
               # Define LSTM
        
        # Fully connected layer for the output
        self.lstm = nn.LSTM(input_size=7 * 7, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        inputs = self.stem(inputs)
        inputs = inputs.view(inputs.size(0), self.hidden_dim, -1)  # B x 1 x (7x7)

        # Tip: forward RNN
        # hidden = pass
        h0 = torch.zeros(self.num_layers, 
                         inputs.size(0), 
                         self.hidden_dim).to(inputs.device)
        c0 = torch.zeros(self.num_layers, 
                         inputs.size(0), 
                         self.hidden_dim).to(inputs.device)

        # Forward propagate LSTM
        out, _ = self.lstm(inputs, (h0, c0))  
        # out: tensor of shape (batch_size, seq_len, hidden_dim)

        # Get the last time step output
        out = out[:, -1, :]

        # Pass the output through the fully connected layer
        out = self.fc(out)

        # Apply the softmax activation
        output = self.softmax(out)
        return output

In [12]:
# LSTM
class MnistLSTM(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=128, 
                 output_dim=10,  num_layers=1):
        super(MnistLSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.stem = nn.Sequential(
            nn.Conv2d(
                input_dim, hidden_dim,
                kernel_size=4, stride=4),
            nn.ReLU())

        # Tip: define LSTM (the official implementation in `nn`` can be used)
        # pass
        
        self.flattened_size = hidden_dim * 7 * 7
        # Fully connected layer for the output
        self.lstm = nn.LSTM(input_size=7 * 7, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        inputs = self.stem(inputs)
        inputs = inputs.view(inputs.size(0), -1, self.hidden_dim)  # B x (7x7) x D
        # inputs = inputs.permute(0, 2, 1) 

        # Tip: forward LSTM
        # output = pass
        h0 = torch.zeros(self.num_layers, 
                         inputs.size(0), 
                         self.hidden_dim).to(inputs.device)
        c0 = torch.zeros(self.num_layers, 
                         inputs.size(0), 
                         self.hidden_dim).to(inputs.device)

        # Forward propagate LSTM
        out, _ = self.lstm(inputs, (h0, c0))  # out: tensor of shape (batch_size, seq_len, hidden_dim)

        # Get the last time step output
        out = out[:, -1, :]

        # Pass the output through the fully connected layer
        out = self.fc(out)

        # Apply the softmax activation
        # output = self.softmax(out)
        output = F.log_softmax(out, dim=1)
        return output

## Train the RNN model

In [4]:
model = MnistRNN().to(device)
optimizer = Adam(model.parameters(), lr=0.001)

In [5]:
def train(epoch, num_epochs):
    data_loader = get_dataloader(True)
    total_step = len(data_loader)
    for idx, (input, target) in enumerate(data_loader):
        optimizer.zero_grad()
        output = model(input.to(device))
        loss = F.nll_loss(output, target.to(device))
        loss.backward()
        optimizer.step()
        if (idx+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, idx+1, total_step, loss.item()))

In [6]:
def test():
    loss_list = []
    acc_list = []
    test_dataloader = get_dataloader(train = False,batch_size=TEST_BATCH_SIZE)
    for idx,(input,target) in enumerate(test_dataloader):
        with torch.no_grad():
            output = model(input.to(device))
            target = target.to(device)
            cur_loss = F.nll_loss(output, target)
            loss_list.append(cur_loss.cpu())
            pred = output.max(dim = -1)[-1]
            cur_acc = pred.eq(target).float().mean()
            acc_list.append(cur_acc.cpu())
    print("Mean accuracy: ", np.mean(acc_list), "Mean loss: ", np.mean(loss_list))

In [7]:
test()
num_epochs = 3
for i in range(num_epochs):
    train(i, num_epochs)
test()

Mean accuracy:  0.15520002 Mean loss:  2.3003516
Epoch [1/3], Step [100/469], Loss: 0.7413
Epoch [1/3], Step [200/469], Loss: 0.3111
Epoch [1/3], Step [300/469], Loss: 0.3391
Epoch [1/3], Step [400/469], Loss: 0.2736
Epoch [2/3], Step [100/469], Loss: 0.2641
Epoch [2/3], Step [200/469], Loss: 0.1956
Epoch [2/3], Step [300/469], Loss: 0.2292
Epoch [2/3], Step [400/469], Loss: 0.1854
Epoch [3/3], Step [100/469], Loss: 0.1408
Epoch [3/3], Step [200/469], Loss: 0.1017
Epoch [3/3], Step [300/469], Loss: 0.0769
Epoch [3/3], Step [400/469], Loss: 0.1259
Mean accuracy:  0.9636 Mean loss:  0.1170838


## Train the LSTM model

In [8]:
model = MnistLSTM().to(device)
optimizer = Adam(model.parameters(), lr=0.001)

In [13]:
test()
num_epochs = 3
for i in range(num_epochs):
    train(i, num_epochs)
test()

RuntimeError: input.size(-1) must be equal to input_size. Expected 49, got 128