# RNN Model Comparison-LSTM vs GRU

$$ Author : Xinyue \ Lyu $$
$$ Email: xl669@cornell.edu $$

# Data Preparation

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234  # set seed

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# tokenize the text using "Spacy"
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

# split the data into train and test
train, test = datasets.IMDB.splits(TEXT, LABEL)
# further split the train into train and valid
train, valid = train.split(random_state=random.seed(SEED))

In [2]:
# use the pre-trained word embeddings
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

The glove is the algorithm used to calculate the vectors. 6B indicates these vectors were trained on 6 billion tokens. 100d indicates these vectors are 100-dimensional.

In [3]:
# 64 sentences with similar length will be returned
BATCH_SIZE = 64

# create iterators for train/valid/test
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

# LSTM Model

In [4]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Initialize the LSTM model object
        vocab_size: the dimension of the vocab vector
        embedding_dim: the dimension of the dense word vector after embedding
        hidden_dim: the dimension of the hidden layer
        output_dim: the dimension of the output 
        n_layers: the number of the layers
        bidirectional: add another layer that processes from last to first
        dropout: used to reduce overfitting
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        """
        define the forwarding process between each node
        x: x is [sent len, batch size]
        """
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

# GRU Model

In [5]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Initialize the GRU model object
        vocab_size: the dimension of the vocab vector
        embedding_dim: the dimension of the dense word vector after embedding
        hidden_dim: the dimension of the hidden layer
        output_dim: the dimension of the output 
        n_layers: the number of the layers
        bidirectional: add another layer that processes from last to first
        dropout: used to reduce overfitting
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        """
        define the forwarding process between each node
        x: x is [sent len, batch size]
        """
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

# Train the models

In [6]:
# initialize the parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# fit the LSTM and GRU models respectively with the initialized features
model_lstm = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_gru = GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [7]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [8]:
# For LSTM model, assign the pretrained embedding weight to the embedding layer
model_lstm.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [9]:
# For GRU model, assign the pretrained embedding weight to the embedding layer
model_gru.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [10]:
# Create the optimizer with Adam method for LSTM and GRU
import torch.optim as optim

optimizer_lstm = optim.Adam(model_lstm.parameters())
optimizer_gru = optim.Adam(model_gru.parameters())

In [11]:
# use binomial cross entropy as the loss function 
# Sigmoid layer and the BCELoss
criterion = nn.BCEWithLogitsLoss()
# let the code run in GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_lstm = model_lstm.to(device)
model_gru = model_gru.to(device)

criterion = criterion.to(device)

In [12]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    calculate the accuracy rate of prediction
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [13]:
def train(model, iterator, optimizer, criterion):
    """
    Train the model
    model: which model to be trained
    iterator: the number of iterations
    optimizer: specifies the learning rate
    criterion: criteria to evaluate loss
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
def evaluate(model, iterator, criterion):
    """
    Evaluate the model after the model is trained
    model: the name of the model
    iterator: the number of iterations
    criterion: criteria to evaluate loss
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
# train the LSTM model for five times
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss_lstm, train_acc_lstm = train(model_lstm, train_iterator, optimizer_lstm, criterion)
    valid_loss_lstm, valid_acc_lstm = evaluate(model_lstm, valid_iterator, criterion)
    torch.cuda.empty_cache()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_lstm:.3f}, Train Acc: {train_acc_lstm*100:.2f}%, Val. Loss: {valid_loss_lstm:.3f}, Val. Acc: {valid_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.687, Train Acc: 53.82%, Val. Loss: 0.667, Val. Acc: 61.81%
Epoch: 02, Train Loss: 0.677, Train Acc: 57.08%, Val. Loss: 0.671, Val. Acc: 62.76%
Epoch: 03, Train Loss: 0.671, Train Acc: 57.26%, Val. Loss: 0.660, Val. Acc: 61.50%
Epoch: 04, Train Loss: 0.638, Train Acc: 64.96%, Val. Loss: 0.638, Val. Acc: 65.18%
Epoch: 05, Train Loss: 0.546, Train Acc: 73.23%, Val. Loss: 0.523, Val. Acc: 76.56%


In [16]:
# test the model and print the accuracy rate
test_loss_lstm, test_acc_lstm = evaluate(model_lstm, test_iterator, criterion)

print(f'Test Loss: {test_loss_lstm:.3f}, Test Acc: {test_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.581, Test Acc: 72.73%


In [17]:
# train the GRU model for five times
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss_gru, train_acc_gru = train(model_gru, train_iterator, optimizer_gru, criterion)
    valid_loss_gru, valid_acc_gru = evaluate(model_gru, valid_iterator, criterion)
    torch.cuda.empty_cache()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_gru:.3f}, Train Acc: {train_acc_gru*100:.2f}%, Val. Loss: {valid_loss_gru:.3f}, Val. Acc: {valid_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.672, Train Acc: 57.03%, Val. Loss: 0.473, Val. Acc: 79.01%
Epoch: 02, Train Loss: 0.352, Train Acc: 85.22%, Val. Loss: 0.277, Val. Acc: 89.22%
Epoch: 03, Train Loss: 0.222, Train Acc: 91.39%, Val. Loss: 0.255, Val. Acc: 90.37%
Epoch: 04, Train Loss: 0.154, Train Acc: 94.33%, Val. Loss: 0.285, Val. Acc: 89.09%
Epoch: 05, Train Loss: 0.117, Train Acc: 95.89%, Val. Loss: 0.287, Val. Acc: 90.08%


In [18]:
# test the model and print the accuracy rate
test_loss_gru, test_acc_gru = evaluate(model_gru, test_iterator, criterion)
torch.cuda.empty_cache()
print(f'Test Loss: {test_loss_gru:.3f}, Test Acc: {test_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.338, Test Acc: 87.49%


# Predict the sentiments

In [19]:
import spacy
nlp = spacy.load('en')

def predict_sentiment_lstm(sentence):
    """
    Predict the sentiment when given a sentence using LSTM model
    sentence: given a sentence
    :return: a number between 0 and 1 
             the closer to 1, the more positive the sentiment is
             the closer to 0, the more negative the sentiment is
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(model_lstm(tensor))
    return prediction.item()

def predict_sentiment_gru(sentence):
    """
    Predict the sentiment when given a sentence using GRU model
    sentence: given a sentence
    :return: a number between 0 and 1 
             the closer to 1, the more positive the sentiment is
             the closer to 0, the more negative the sentiment is
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(model_gru(tensor))
    return prediction.item()

In [21]:
# predict the sentiment using LSTM Model
print(predict_sentiment_lstm("This film is terrible"))
print(predict_sentiment_lstm("This film is great"))

0.21193359792232513
0.8112065196037292




In [22]:
# predict the sentiment using gru Model
print(predict_sentiment_gru("This film is terrible"))
print(predict_sentiment_gru("This film is great"))

0.25392991304397583
0.9378848671913147




# Comparion between LSTM and GRU Models

We can apparently see that GRU performs better than LSTM with this dataset as the test accuracy rate of GRU is 87.49% while the test accuracy rate of LSTM is only 72.73%.