Task 4: Train an LSTM Model (40 points)
----
1. Using PyTorch, implement a neural network that uses one or more LSTM cells to do sentiment analysis. Use the nn.Embedding, nn.LSTM and nn.Linear layers to construct your model.
2. Note that sequence processing works differently with the PyTorch Embedding layer as compared to my sample code from class. The model input expects a padded tensor of token indices from the vocabulary, instead of one-hot encodings. For evaluation, use a vocabulary size of 10000 (max_features = 10000).
3. The model should have a single output with the sigmoid activation function for classification. The dimensions of the embedding layer and the hidden layer(s) are up to you, but please make sure your model does not take more than ~3 minutes to train.
4. Evaluate the model using PyTorch functions for average accuracy, area under the ROC curve and F1 scores (see [torchedev](https://pytorch.org/torcheval/stable/)).

In [1]:
    import numpy as np
    import pandas as pd
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from nltk.corpus import stopwords
    from collections import Counter
    import string
    import re
    import seaborn as sns
    from tqdm import tqdm
    import matplotlib.pyplot as plt
    from torch.utils.data import TensorDataset, DataLoader
    from sklearn.model_selection import train_test_split



In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available")
else:
    device = torch.device("cpu")
    print("CPU used")

GPU is available


In [3]:
train_data_file = 'movie_reviews_train.txt'
train_df = pd.read_csv(train_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_train, y_train = train_df['review'].values, train_df['label'].values

dev_data_file = 'movie_reviews_dev.txt'
dev_df = pd.read_csv(dev_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_dev, y_dev = dev_df['review'].values, dev_df['label'].values

test_data_file = 'movie_reviews_test.txt'
test_df = pd.read_csv(test_data_file, sep='\t', header=None, names=['id', 'review', 'label'])[['review', 'label']]
X_test, y_test = test_df['review'].values, test_df['label'].values

In [4]:
def preprocess_token(s): # This function is for pre-processing each token, not the entire sequence
    # Retain only alphanumeric characters
    s = re.sub(r'[^a-zA-Z0-9]', '', s)
    # replace digits with no space
    s = re.sub(r'\d', '', s)
    # Replace all whitespace sequences with no space
    s = re.sub(r'\s+', '', s)
    return s

def tokenize(x_train, x_dev, x_test, vocab_size): # This function is for pre-processing strings, which uses the above.
    stop_words = set(stopwords.words('english'))

    all_tokens = []
    for text in np.concatenate((x_train, x_dev, x_test)):
        for word in text.split():
            if word.lower() not in stop_words:# and preprocess_token(word) != '':
                all_tokens.append(preprocess_token(word))
                
    word_counts = Counter(all_tokens)              
    vocab = Counter()
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.most_common(vocab_size-1))}
    vocab["<PAD>"] = 0
    
    x_train_seq = []
    x_dev_seq = []
    x_test_seq = []

    for doc in x_train:
        tokens = [preprocess_token(word.lower()) for word in doc.split() if word.lower() not in stop_words]
        sequence = [vocab.get(token) for token in tokens if token in vocab]
        x_train_seq.append(sequence)
#         pass

    for doc in x_dev:
        tokens = [preprocess_token(word.lower()) for word in doc.split() if word.lower() not in stop_words]
        sequence = [vocab.get(token) for token in tokens if token in vocab]
        x_dev_seq.append(sequence)

    for doc in x_test:
        tokens = [preprocess_token(word.lower()) for word in doc.split() if word.lower() not in stop_words]
        sequence = [vocab.get(token) for token in tokens if token in vocab]
        x_test_seq.append(sequence)
        
    max_len = max(max(len(seq) for seq in sequences) for sequences in [x_train_seq, x_dev_seq, x_test_seq])

    x_train_seq = [[0]*(max_len-len(seq)) + seq for seq in x_train_seq]
    x_dev_seq = [[0]*(max_len-len(seq)) + seq for seq in x_dev_seq]
    x_test_seq = [[0]*(max_len-len(seq)) + seq for seq in x_test_seq]

    return x_train_seq,x_dev_seq,x_test_seq,vocab



In [5]:
# Tokenize your train, test and development data

### YOUR CODE HERE ###
train_padded, dev_padded, test_padded, vocab = tokenize(X_train, X_dev, X_test, vocab_size=10000)
train_padded = torch.tensor(train_padded, dtype=torch.long)
dev_padded = torch.tensor(dev_padded, dtype=torch.long)
test_padded = torch.tensor(test_padded, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_dev = torch.tensor(y_dev, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [6]:
train_data = TensorDataset(train_padded, y_train)
dev_data = TensorDataset(dev_padded, y_dev)
test_data = TensorDataset(test_padded, y_test)

batch_size = 50
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


In [7]:
class SentimentRNN(nn.Module):
    def __init__(self,num_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
        self.hidden_dim = hidden_dim
        self.no_layers = num_layers

        # embedding and LSTM layers

        ###### YOUR CODE HERE #######
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=drop_prob)

        # Dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # Linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()


    def forward(self,x,hidden):

        # embeddings and lstm_out

        ### YOUR CODE HERE ###
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        out = self.dropout(lstm_out)
        out = self.fc(out)
        y_pred = self.sigmoid(out)
        y_pred = y_pred.view(batch_size, -1)
 
        y_pred = y_pred[:, -1]
        return y_pred, hidden        

#         pass

    def init_hidden(self, batch_size):
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden


In [8]:
no_layers = 4       #tried increasing layers to see change in accuracy
vocab_size = len(vocab)
embedding_dim = 64      # Increased embedding dimension for trial
output_dim = 1
hidden_dim = 256        # Increased hidden dimension for trial

model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

#moving to gpu
model.to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(10000, 64)
  (lstm): LSTM(64, 256, num_layers=4, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [9]:
lr=0.001

# you should use binary cross-entropy as your loss function and Adam optimizer for this task
import torch.optim as optim
import torch.nn as nn

criterion = nn.BCELoss()  
optimizer = optim.Adam(model.parameters(), lr=lr) 


# function to predict accuracy
def acc(pred,label):

    ### YOUR CODE HERE ###

    # pass
    pred_rounded = torch.round(pred.squeeze())
    return torch.sum(pred_rounded==label.squeeze()).item()

In [10]:
clip = 5
epochs = 10
valid_loss_min = np.Inf
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    train_losses, train_acc = [], 0.0
    model.train()
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        
        inputs, labels = inputs.to(device), labels.to(device)   
        h = tuple([each.data for each in h])
        
        model.zero_grad()
        output,h = model(inputs,h)
        
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        accuracy = acc(output,labels)
        train_acc += accuracy
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
    val_h = model.init_hidden(batch_size)
    val_losses, val_acc = [], 0.0
    model.eval()
    for inputs, labels in dev_loader:
            val_h = tuple([each.data for each in val_h])
            inputs, labels = inputs.to(device), labels.to(device)
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.float())
            val_losses.append(val_loss.item())  
            accuracy = acc(output,labels)
            val_acc += accuracy
            
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(dev_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc} val_accuracy : {epoch_val_acc}')
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'model.pt')
        print('Validation loss decreased ({:.5f} --> {:.5f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(50 * '==')

Epoch 1
train_loss : 0.694859741255641 val_loss : 0.6926769614219666
train_accuracy : 0.480625 val_accuracy : 0.57
Validation loss decreased (inf --> 0.69268).  Saving model ...
Epoch 2
train_loss : 0.6832661777734756 val_loss : 0.6868956238031387
train_accuracy : 0.578125 val_accuracy : 0.5
Validation loss decreased (0.69268 --> 0.68690).  Saving model ...
Epoch 3
train_loss : 0.6244987193495035 val_loss : 0.724558025598526
train_accuracy : 0.658125 val_accuracy : 0.615
Epoch 4
train_loss : 0.5160010028630495 val_loss : 0.7529969215393066
train_accuracy : 0.759375 val_accuracy : 0.61
Epoch 5
train_loss : 0.40143234468996525 val_loss : 0.8260402530431747
train_accuracy : 0.825625 val_accuracy : 0.62
Epoch 6
train_loss : 0.30122075136750937 val_loss : 0.9024200737476349
train_accuracy : 0.88125 val_accuracy : 0.635
Epoch 7
train_loss : 0.1987796991597861 val_loss : 1.0874906182289124
train_accuracy : 0.93875 val_accuracy : 0.605
Epoch 8
train_loss : 0.1543990708887577 val_loss : 1.16102

NOTE: your train loss should be smaller than 1 and your train accuracy should be over 75%

In [11]:
model.eval()
test_h = model.init_hidden(batch_size)
test_acc = 0.0

# Evaluate model on your test data and report the accuracy

### YOUR CODE HERE ###
model.eval()
test_acc = 0.0
test_h = model.init_hidden(batch_size)
for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    test_h = tuple([each.data for each in test_h])
    
    output, test_h = model(inputs, test_h)
    accuracy = acc(output, labels)
    test_acc += accuracy

test_accuracy = test_acc / len(test_loader.dataset)
print(f'Test Accuracy is: {test_accuracy * 100}%')

Test Accuracy is: 62.0%


NOTE: your eval accuracy should be of at least 60%.