# Section 2: RNNs in PyTorch

## Goals
1. Build a simple RNN classifier
2. Learn about PyTorch's in-built RNN modules (LSTM etc.)

(Roughly follows http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchtext
from torchtext.vocab import Vectors, GloVe

### Part 1: Building an RNN sentiment classifier
#### Part 1.1: Generating the data

First we'll generate some toy data. The task will be to recall an integer at a certain position in a sequence. 
For a sequence a<sub>1</sub> a<sub>12</sub> a<sub>3</sub> a<sub>4</sub> a<sub>5</sub> the output might be a<sub>3</sub>.

In [2]:
# number of training examples
n_train = 2000

# number of validation examples
n_val = 1000

# length of each sequence
n_length = 10

# examples per batch
n_batch = 32

# size of the vocabulary
n_vocab = 20

# position to be recalled
answer_pos = n_length-1

# generate random sequences
train_seq = Variable(torch.Tensor(n_train, n_length).random_(0, n_vocab).long())
val_seq = Variable(torch.Tensor(n_val, n_length).random_(0, n_vocab).long())

# choose the correct labels
train_labels = train_seq.clone()[:, answer_pos]
val_labels = val_seq.clone()[:, answer_pos]

# group data into batches
train_iter = []
for i in range(0, n_train, n_batch):
    batch_seq = train_seq[i:i+n_batch]
    batch_labels = train_labels[i:i+n_batch]
    if (batch_seq.size()[0] == n_batch):
        train_iter.append([batch_seq, batch_labels])
    
val_iter = []
for i in range(0, n_val, n_batch):
    batch_seq = val_seq[i:i+n_batch]
    batch_labels = val_labels[i:i+n_batch]
    if (batch_seq.size()[0] == n_batch):
        val_iter.append([batch_seq, batch_labels])

#### Part 1.2 Build the model (version 1)

The RNN module will be a PyTorch model like any other, with init a forward functions. This network:
1. Takes as input the word at a particular point in the sequence, as well as the hidden state at the previous state of the network
2. Uses nn.Embedding to get a vector for the word
3. Concatenate the embedding and the hidden state
4. Apply a linear layer to get the next hidden state
5. Apply a linear layer to get the output
6. Output both 

In [38]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, vocab_size):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        combined = torch.cat((embedded, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

#### Part 1.3: Train the model

Now we can initialize and train the network:

In [39]:
def train_batch(model, criterion, optim, batch, label):
    # initialize hidden vector
    hidden = Variable(torch.zeros(n_batch, n_hidden))

    # clear gradients
    rnn.zero_grad()

    # calculate forward pass
    for i in range(batch.size()[1]):
        output, hidden = model(batch[:, i], hidden)

    # calculate loss    
    loss = criterion(output, label)

    # backpropagate and step
    loss.backward()
    optim.step()
    
    return loss.data[0]

In [41]:
# training loop
def train(model, criterion, optim):
    for e in range(n_epochs):
        batches = 0
        epoch_loss = 0
        avg_loss = 0
        for batch, label in train_iter:
            batch_loss = train_batch(model, criterion, optim, batch, label)
            batches += 1
            epoch_loss += batch_loss
            avg_loss = ((avg_loss * (batches - 1)) + batch_loss) / batches
        
        print("Epoch ", e, " Loss: ", epoch_loss)
        


In [42]:
# size of the hidden vector
n_hidden = 3

# initialize the network
rnn = RNN(n_vocab, n_hidden, n_vocab, n_vocab)

n_epochs = 30
learning_rate = .05
criterion = nn.NLLLoss()
optim = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

train(rnn, criterion, optim)

TypeError: __init__() got an unexpected keyword argument 'dim'

#### Part 1.4: Test the model

Testing the model is similar to training it:

In [13]:
def test_batch(batch, label):
    if (batch.size()[0] != n_batch):
        return 0, 0
    
    # initialize hidden state
    hidden = Variable(torch.zeros(n_batch, n_hidden))
    
    # calculate forward pass
    for i in range(batch[0].size()[0]):
        output, hidden = rnn(batch[:, i], hidden)
        
    # calculate predictions
    _, pred = output.max(1)

    # calculate number of correct predictions
    correct = (pred == label).long().sum().data[0]
    return correct, n_batch

Then calculate the total score by looping through the batches:

In [15]:
# Test loop

batch_num = 0
correct = 0
total = 0
for i in range(len(val_iter)):
    batch, label = val_iter[i]
    batch_correct, batch_size = test_batch(batch, label)
    batch_num += 1
    correct += batch_correct
    total += batch_size
    
print("Fraction correct: ", correct / total)

Fraction correct:  1.0


### Part 2: Using PyTorch RNN modules

PyTorch's RNN capabilities live [here](http://pytorch.org/docs/master/nn.html#recurrent-layers). We can use it as follows (note that the input is batched along the **second** dimension):

In [16]:
n_input = 10
n_hidden = 20
n_layers = 2
n_batch = 3
n_length = 5
rnn = nn.LSTM(n_input, n_hidden, n_layers)
input = Variable(torch.randn(n_length, n_batch, n_input))
h0 = Variable(torch.randn(n_layers, n_batch, n_hidden))
c0 = Variable(torch.randn(n_layers, n_batch, n_hidden))
output, hn = rnn(input, (h0, c0))
print(output, hn)

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
   0.4608 -0.1072 -0.0000 -0.0044  0.1335  0.0743  0.2502 -0.2156  0.2602
 -0.4931 -0.2171 -0.1714 -0.5364 -0.0910  0.4206  0.1661 -0.0412 -0.3391
 -0.4461 -0.1393  0.2427 -0.1197 -0.4182  0.0320  0.2920  0.2345 -0.4173

Columns 9 to 17 
   0.1045  0.5604  0.4094 -0.1384 -0.3334 -0.0955 -0.1117 -0.0656  0.0648
 -0.3996 -0.0585  0.1020  0.1063  0.1838 -0.1989  0.1435 -0.0395  0.3219
  0.4321 -0.0071 -0.1174  0.1073 -0.4654 -0.0068  0.4993 -0.0367  0.2381

Columns 18 to 19 
  -0.0390 -0.4322
  0.1172 -0.1978
  0.1186  0.0487

(1 ,.,.) = 

Columns 0 to 8 
   0.1953  0.0525  0.1024 -0.0282 -0.0380  0.0456  0.1338 -0.1059  0.1165
 -0.2181 -0.0132 -0.0461 -0.1994 -0.0268  0.1971  0.1365  0.0736 -0.2742
 -0.1464  0.0155  0.2230 -0.0221 -0.2437  0.0604  0.1570  0.2687 -0.2530

Columns 9 to 17 
  -0.0131  0.3302  0.3256  0.0515 -0.2773 -0.1092 -0.1512 -0.1325  0.0694
 -0.2193  0.1088  0.1282  0.1127  0.0095 -0.0725 -0.0710 -0.1267  0.1606
  0.12

We'll define a custom module to apply this module to our problem. This module will embed each integer, then apply the LSTM to the sequence, and then apply a linear and a softmax to get probabilities for each class:

In [35]:
class MyLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, vocab_size, n_layers):
        super(MyLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        # embed the input integers
        embedded = self.embedding(input)
        
        # put the batch along the second dimension
        embedded = embedded.transpose(0, 1)
        
        # apply the LSTM
        output, hidden = self.lstm(embedded, hidden)
        
        # apply the linear and the softmax
        output = self.softmax(self.linear(output))

        return output, hidden

Training and testing are essentially the same as before, except that we no longer need to manually loop in the forward pass:

In [36]:
def train_batch(model, criterion, optim, batch, label):
    # initialize hidden vectors
    hidden = (Variable(torch.zeros(n_layers, n_batch, n_hidden)), Variable(torch.zeros(n_layers, n_batch, n_hidden)))

    # clear gradients
    rnn.zero_grad()

    # calculate forward pass
    output, hidden = model(batch, hidden)

    # calculate loss    
    loss = criterion(output[answer_pos], label)

    # backpropagate and step
    loss.backward()
    optim.step()
    
    return loss.data[0]

In [37]:
# training loop
def train(model, criterion, optim):
    for e in range(n_epochs):
        batches = 0
        epoch_loss = 0
        avg_loss = 0
        for batch, label in train_iter:
            batch_loss = train_batch(model, criterion, optim, batch, label)
            batches += 1
            epoch_loss += batch_loss
            avg_loss = ((avg_loss * (batches - 1)) + batch_loss) / batches
        
        print("Epoch ", e, " Loss: ", epoch_loss)
        


In [30]:
# size of the embeddings and vectors
n_embedding = 128
n_hidden = 128

# number of layers
n_layers = 1

# initialize LSTM
rnn = MyLSTM(n_embedding, n_hidden, n_vocab, n_vocab, n_layers)

n_epochs = 30
learning_rate = .1
criterion = nn.NLLLoss()
optim = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

train(rnn, criterion, optim)

RuntimeError: inconsistent tensor size, expected r_ [32 x 512], t [32 x 512] and src [3 x 512] to have the same number of elements, but got 16384, 16384 and 1536 elements respectively at d:\projects\pytorch\torch\lib\th\generic/THTensorMath.c:887

In [None]:
# Test loop

batch_num = 0
correct = 0
total = 0
for i in range(len(val_iter)):
    batch, label = val_iter[i]
    batch_correct, batch_size = test_batch(batch, label)
    batch_num += 1
    correct += batch_correct
    total += batch_size
    
print("Percent correct: ", correct / total)