# LSTM and GRU

## Mingcan Yuan (my463)

### Import Data and Package

In [3]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

### Field decides how our data should be prossessed. The 'TEXT'  field handles the review
### and the "LABEL" field handles the sentiment.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

### Automatically download the IMDb dataset and split it into the canonical train/test splits
### as "torchtext.dataset" objects
train, test = datasets.IMDB.splits(TEXT, LABEL)

### Further split the training set to training set and validation set. This split is a 70/30 split.
train, valid = train.split(random_state=random.seed(SEED))

In [4]:
### Construct the vectors to represent each vocabularies.
### Here, we use pre-trained word embeddings. These vectors have been trained on corpuses of billions of tokens. 
### Our word embeddings are initialized with these pre-trained vectors, where words that appear in similar contexts appear nearby in this vector space.
### The "glove" is the algorithm used to calculate the vectors.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [5]:
BATCH_SIZE = 64

### This step we create the iterators. "BucketIterator" firstly sorts the example
### by the length of the sentences, then partitions them into buckets.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

### Define Models

In [6]:
import torch.nn as nn

class RNN_LSTM(nn.Module):
    
    '''
    ### Within the "__init__" we define the layers of the module. The layers include embedding layer, RNN, and a linear layer.
    ### The embedding layer is used to transform our vectors, which denote vocabularies, into a dense embedding vector.
    ### The RNN layer is our RNN which takes in ourse dense vector and the previous hidden state to calculate the next hidden state.
    ### The linear layer takes the final hidden state and feeds it through a fully connected layer, transforming
    ### it to the correct output dimension.
    
    ### Implementing bidirectionality and adding additional layers are done by passing values for the "num_layers" and "bidirectional"
    ### arguments for the LSTM/GRU.
    
    ### Dropout is implemented by initializing an nn.Dropout layer (the argument is the probability of dropout for each neuron)
    ### and using it within the forward method after each layer we want to apply to dropout to.
    '''
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    '''
    ### The "forward" method is called when we feed examples into the model.
    ### Firstly, the input batch is passed through the embedding layer to get embedded.
    ### Secondly, the "embedded" is fed into the RNN.
    ### Then we get the return of RNN layer. "Output" is the concatenation of the hidden
    ### state from every step, whereas "hidden" is the final hidden state.
    '''
    def forward(self, x):
        
        #x = [sentence length, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sentence length, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))
    
    
class RNN_GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]22222
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [7]:
### Create the instances of model_LSTM class and model_GRU class 

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model_LSTM = RNN_LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_GRU = RNN_GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

### Train the Models

In [8]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [9]:
### First, we create an optimizer. This is the algorithm we use to ipdate the parameters 
### of the module. Here, we will use Adam algorithm.
import torch.optim as optim

optimizer_LSTM = optim.Adam(model_LSTM.parameters())
optimizer_GRU = optim.Adam(model_GRU.parameters())

In [10]:
### We define the loss function, which is "binary cross entropy with logits"
criterion = nn.BCEWithLogitsLoss()

### If PyTorch detects a GPU, we can place the model and the criterion on the GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_LSTM = model_LSTM.to(device)
model_GRU = model_GRU.to(device)
criterion = criterion.to(device)

In [11]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [12]:
def train(model, iterator, optimizer, criterion):
    
    '''
    The 'train' function iterates over all examples, a batch at a time.
    ’model.train()‘ is used to put the model in "training mode".
    
    For each batch, we first zero the gradients. Then we feed the batch of
    sentences "batch.text" into the model, calculate the loss and accuracy, and
    finally calculate the gradient of each parameter with "loss.backward()"
    and update the parameters using the gradients and optimizer algorithm with
    "optimizer.step()"
    
    The final step, we return the loss and accuracy, averaged across the epoch.
    '''
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [13]:
def evaluate(model, iterator, criterion):
    
    '''
    This function is similiar to 'train', with a few modifications as we don't want to update
    the parameters.
    '''
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### Test Models and Evaluate Models

In [14]:

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    
    
    ### We train the model through multiple epochs, an epoch being a complete pass through all examples in the split.
    
    
    train_loss, train_acc = train(model_LSTM, train_iterator, optimizer_LSTM, criterion)
    valid_loss, valid_acc = evaluate(model_LSTM, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.693, Train Acc: 51.62%, Val. Loss: 0.691, Val. Acc: 51.56%
Epoch: 02, Train Loss: 0.692, Train Acc: 51.78%, Val. Loss: 0.695, Val. Acc: 53.80%
Epoch: 03, Train Loss: 0.666, Train Acc: 59.52%, Val. Loss: 0.615, Val. Acc: 66.54%
Epoch: 04, Train Loss: 0.643, Train Acc: 62.35%, Val. Loss: 0.670, Val. Acc: 59.20%
Epoch: 05, Train Loss: 0.567, Train Acc: 70.73%, Val. Loss: 0.475, Val. Acc: 77.77%


In [15]:
#### Calculate the accuracy of the model in the test set.

test_loss, test_acc = evaluate(model_LSTM, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.486, Test Acc: 76.80%


In [16]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model_GRU, train_iterator, optimizer_GRU, criterion)
    valid_loss, valid_acc = evaluate(model_GRU, valid_iterator, criterion)
    torch.cuda.empty_cache()
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.692, Train Acc: 54.51%, Val. Loss: 0.682, Val. Acc: 58.32%
Epoch: 02, Train Loss: 0.678, Train Acc: 56.47%, Val. Loss: 0.683, Val. Acc: 57.29%
Epoch: 03, Train Loss: 0.593, Train Acc: 67.28%, Val. Loss: 0.454, Val. Acc: 79.18%
Epoch: 04, Train Loss: 0.408, Train Acc: 81.84%, Val. Loss: 0.398, Val. Acc: 82.38%
Epoch: 05, Train Loss: 0.336, Train Acc: 85.40%, Val. Loss: 0.332, Val. Acc: 85.87%


In [18]:
test_loss, test_acc = evaluate(model_GRU, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.384, Test Acc: 84.02%


### Sum up

Based on the results of LSTM and GRU, we can get the conclusion that accuracy from GRU is higher and loss from GUR is lower, which means GRU is better for this data.