# Introduction of Q2
In this question, you will use PyTorch to implement a recurrent neural network, specifically an LSTM model, for sentiment analysis. Sentiment analysis involves classifying the emotional tone of a piece of text as either positive or negative. You will work with a preprocessed dataset of 25,000 movie reviews, stored in three files: 'label.pkl', 'encoded_input.pkl', and 'vocab_to_int.pkl'. The 'label.pkl' file contains the review labels, while the 'encoded_input.pkl' file contains the encoded reviews, in which each word is replaced by its integer ID. The word-to-ID dictionary is saved in the 'vocab_to_int.pkl' file. Your objective is to preprocess the data, implement the model, train it, and evaluate its performance.

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pickle
from string import punctuation

In [None]:
!mkdir data
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1oJWIa3esCmlHeICCqtLVoIYe65IdXqwl'  -c -O label.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vuW6eDpScDoyiJ66K328mYg0Ky6hZrij'  -c -O encoded_input.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1lON426HKNYnONwUn6TtHnd3zIq2JbBBJ'  -c -O vocab_to_int.pkl
!mv *.pkl data/

In [None]:
## load dataset
# Read the three pickles from ./data one after another: the encoded reviews,
# their labels, and the word -> integer id dictionary.
with open('./data/encoded_input.pkl', 'rb') as encoded_file:
    encoded_input = pickle.load(encoded_file)

with open('./data/label.pkl', 'rb') as label_file:
    labels = pickle.load(label_file)

with open('./data/vocab_to_int.pkl', 'rb') as vocab_file:
    vocab_to_int = pickle.load(vocab_file)


# Quick look at what was loaded: the first ten labels, the first ten encoded
# reviews, and the whole vocabulary mapping.
for preview in (labels[:10], encoded_input[:10], vocab_to_int):
    print(preview)

In [None]:
## pad the encoded input to the same shape
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's
        or truncated to the input seq_length.

        params:
        reviews_ints - sequence of reviews, each a sequence of integer word ids
        seq_length - number of columns every output row will have

        returns:
        integer numpy array of shape (len(reviews_ints), seq_length). Short
        reviews are left-padded with zeros (the ids sit at the end of the
        row); long reviews keep only their first seq_length ids; empty
        reviews become an all-zero row.
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for i, row in enumerate(reviews_ints):
        kept = np.array(row)[:seq_length]
        # Guard the empty case: a slice starting at -0 is the same as one
        # starting at 0, so it would select the whole row and the assignment
        # would fail to broadcast.
        if kept.size:
            features[i, -kept.size:] = kept

    return features

In [None]:
# The longest review is too large to use as the common length, so every
# review is padded or truncated to 200 tokens instead.
max_length_padding = 200

max_length = max(map(len, encoded_input))
print('original maximum length: ', max_length)

padded_input = pad_features(encoded_input, max_length_padding)
print('new maximum length: ', max_length_padding)

In [None]:
# Fractions of the dataset assigned to each split (they sum to 1.0).
train_frac = 0.8
val_frac = 0.1
# NOTE(review): eval_frac is not read by the provided code; presumably it is
# the test-set fraction for the split implemented below.
eval_frac = 0.1

# Number of labelled examples available for splitting.
total_size = len(labels)

# Split the dataset for training, validation and evaluation.
# The next cell passes train_x/val_x/test_x and train_y/val_y/test_y to
# torch.from_numpy, so all six results must be numpy arrays.
"""
Begining of of Implement

1. split padded_input into variables "train_x", "val_x", "test_x";
2. split labels into variables "train_y", "val_y", "test_y";
"""

# your code here

"""
End of Implement
"""



## print out the shapes of your resultant feature data
## (raises NameError until the split above has been implemented)
print("\t\t\tFeatures Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

In [None]:
# Wrap each (features, labels) numpy split as a TensorDataset
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
# NOTE: the training/evaluation cells call net.init_hidden(batch_size) with
# this fixed size, so each split's length should be a multiple of batch_size.
batch_size = 50

# Only the training data is shuffled. The validation and test loops carry the
# LSTM hidden state from one batch to the next, so a shuffled batch order
# would make their losses/accuracy differ from run to run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
# Detect CUDA once; the model and the training/evaluation cells read this
# flag to decide whether tensors are moved to the GPU.
train_on_gpu = torch.cuda.is_available()

print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

In [None]:
## Define the recurrent neural network

class RNN_model(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.

    The embedding, LSTM, dropout, linear and sigmoid layers, and the forward
    computation that uses them, are exercise sections to be filled in; the
    hidden-state initialiser is provided.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob):
        """Record the layer sizes and build the network layers.

        params:
        vocab_size - number of entries in the embedding table
        output_size - number of outputs of the final linear layer
        embedding_dim - size of each word embedding
        hidden_dim - number of hidden units in the LSTM
        n_layers - number of stacked LSTM layers
        drop_prob - dropout probability used inside the LSTM
        """
        super(RNN_model, self).__init__()

        # Kept on the instance because init_hidden needs them to size the state.
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers
        """
        Begining of of Implement

        1. define self.embedding (vocab_size * embedding_dim) for the vocabulary using nn.Embedding
        2. define self.lstm with input of embedding_dim, hidden_dim, n_layers, and dropout probability of drop_prob.
        Note that batch_first in lstm need to be set to 'True' to process the shape of (batch_size, seq_len, features)
        3. add a dropout layer self.dropout with probability of 0.3 before the linear layer.
        4. add a linear layer self.fc (hidden_dim * outputsize) and a sigmoid output layer self.sig.
        """

        # your code here

        """
        End of Implement
        """



    def forward(self, x, hidden):
        """Perform a forward pass of our model on some input and hidden state.

        params:
        x - batch of encoded reviews; its first dimension is the batch size
        hidden - LSTM state, e.g. as produced by init_hidden

        returns:
        (sig_out, hidden) - the sigmoid output and the updated hidden state,
        both of which must be produced by the exercise section below.
        """
        batch_size = x.size(0)

        """
        Begining of of Implement

        1. get the embeddings of x using self.embedding -> variable "embeds".
        2. get the output of lstm using self.lstm with inputs "embeds" and "hidden" -> "lstm_out" and "hidden".
        3. get the output of the dropout layer and the linear layer -> variable "out".
        4. get the output of the output layer -> variable "sig_out".
        5. return the last sigmoid output and the hidden state -> variables "sig_out" and "hidden"
        """

        # your code here

        """
        End of Implement
        """

        # return last sigmoid output and hidden state
        return sig_out, hidden


    def init_hidden(self, batch_size):
        ''' Initializes hidden state.

            Returns a tuple of two zero tensors (hidden state, cell state),
            each of size n_layers x batch_size x hidden_dim.
        '''
        # Borrow dtype and device from the model's own parameters so the state
        # always lives where the weights live, without consulting any
        # notebook-level GPU flag. Requires at least one registered parameter.
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (torch.zeros(state_shape, dtype=weight.dtype, device=weight.device),
                  torch.zeros(state_shape, dtype=weight.dtype, device=weight.device))

        return hidden


In [None]:
# Instantiate the model hyperparams
vocab_size = len(vocab_to_int) + 1 # +1 for zero padding + our word tokens
output_size = 1
embedding_dim = 400 # size of the embeddings
hidden_dim = 256    # Number of units in the hidden layers of our LSTM cells
n_layers = 2        # Number of LSTM layers
# Passed to RNN_model as its `drop_prob` argument (the names differ).
dropout_prob = 0.5

# The model must be bound to the name `net`: the print below and the
# optimizer/training cells all refer to it by that name.
"""
Begining of of Implement

Define the RNN model using the above RNN_model class and hyperparameters
"""

# your code here

"""
End of Implement
"""
print(net)

In [None]:
# Optimisation setup: learning rate, binary cross-entropy loss, and an Adam
# optimiser over all of the model's parameters.
lr = 1e-3
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)

In [None]:
# Begin to train the networks
# training params
epochs = 4
counter = 0        # number of training batches processed so far
print_every = 100  # run validation and print stats every this many batches
# NOTE(review): clip is not read by the provided code; presumably it is meant
# for gradient clipping inside the optimization step below.
clip=5  # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()


net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    # NOTE: this loop variable rebinds the notebook-level name `labels`
    # (the full label list loaded from the pickle).
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])


        # calculate the loss and perform backprop
        """
        Begining of of Implement
        do the optimization step:
        1. get the output from the model -> variables "output" and "h".
        2. calculate the loss using criterion, output, and the labels -> variables "loss".
        3. clean the old/previous gradient;
        4. compute the current gradient (backward propagation)
        5. update the parameter
        """
        # your code here

        """
        End of Implement
        """


        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # Validation only reads the model, so autograd is switched off.
            # With no graph being recorded, the hidden state needs no manual
            # detaching between batches. Separate val_* names keep the
            # current training batch and its output intact.
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader:

                    if(train_on_gpu):
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_labels.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [None]:
# Get test data loss and accuracy
test_losses = [] # track loss
num_correct = 0  # running count of correctly classified test reviews

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# Evaluation only reads the model, so autograd is switched off for the whole
# loop; with no graph being recorded the hidden state needs no detaching
# between batches.
with torch.no_grad():
    # iterate over test data
    for inputs, labels in test_loader:

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        """
        Begining of of Implement
        do the evaluation step:
        1. get the output from the model -> variables "output" and "h".
        2. calculate the loss using criterion, output, and the labels -> variables "test_loss".
        3. append the "test_loss" into test_losses.
        4. convert output probabilities to predicted class (0 or 1) -> variable "pred".
        """
        # your code here

        """
        End of Implement
        """

        # compare predictions to true label; summing the boolean matches on
        # the tensor works the same on CPU and GPU
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += correct_tensor.sum().item()


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

In [None]:
def tokenize_review(test_review):
    ''' Convert a raw review string into a list holding one list of word ids.

        The text is lowercased, stripped of punctuation and split on
        whitespace; each remaining word is looked up in vocab_to_int.
        Words that are not in the vocabulary are skipped instead of raising
        KeyError, so arbitrary user-written reviews can be tokenized.

        params:
        test_review - a review made of normal text and punctuation

        returns:
        [[id, id, ...]] - a single-element list, ready for pad_features
    '''
    # lowercase and get rid of punctuation in one pass
    test_text = test_review.lower().translate(str.maketrans('', '', punctuation))

    # splitting by spaces
    test_words = test_text.split()

    # tokens (out-of-vocabulary words have no id and are dropped)
    test_ints = [[vocab_to_int[word] for word in test_words if word in vocab_to_int]]

    return test_ints

In [None]:
def predict(net, test_review, sequence_length=200):
    ''' Prints out whether a give review is predicted to be
        positive or negative in sentiment, using a trained model.

        params:
        net - A trained net
        test_review - a review made of normal text and punctuation
        sequence_length - the padded length of a review

        returns:
        None; the raw model output and the verdict are printed.
    '''

    net.eval()

    # tokenize review
    test_ints = tokenize_review(test_review)

    # pad tokenized sequence to the fixed length
    features = pad_features(test_ints, sequence_length)

    # convert to tensor to pass to model
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    # initialize hidden state
    h = net.init_hidden(batch_size)

    if train_on_gpu:
        feature_tensor = feature_tensor.cuda()

    # get the output from the model; this is inference only, so autograd is
    # switched off to avoid recording a graph that is never used
    with torch.no_grad():
        output, h = net(feature_tensor, h)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    # print custom response based on whether test_review is pos/neg
    if pred.item() == 1:
        print('Positive review detected!')
    else:
        print('Negative review detected!')


In [None]:
# Two hand-written sample reviews for trying out the trained model:
# the first is meant to read as positive, the second as negative.
test_review_pos = (
    'This movie had the best acting and the dialogue was so good. '
    'I loved it.'
)
test_review_neg = (
    'The worst movie I have seen; acting was terrible and I want my money back. '
    'This movie had bad acting and the dialogue was slow.'
)

In [None]:
# Try the trained model on both sample reviews, positive first.
seq_length = 200
for sample_review in (test_review_pos, test_review_neg):
    predict(net, sample_review, seq_length)