In [1]:
import re
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn

# Deep Learning for NLP - lab exercise 2


## Data

The data can be download here: http://teaching.caio-corro.fr/2019-2020/OPT7/imdb.zip

There are two files: one with positive reviews (imdb.pos) and one with negative reviews (imdb.neg). Each file contains 300000 reviews, one per line.


The following functions can be used to load and clean the data.

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    f = open(filename)
    dataset=[]
    line =  f.readline()
    cpt=1
    skip=0
    while line :
        cleanline = clean_str(f.readline()).split()
        if cleanline: 
            dataset.append(cleanline)
        else: 
            line = f.readline()
            skip+=1
            continue
        if limit > 0 and cpt >= limit: 
            break
        line = f.readline()
        cpt+=1        
        
    f.close()
    print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset


The following cell load the first 5000 sentences in each review set.

In [3]:
LIM=5000
txtfile = "./Data/imdb.pos"  # path of the file containing positive reviews
postxt = loadTexts(txtfile,limit=LIM)

txtfile = "./Data/imdb.neg"  # path of the file containing negative reviews
negtxt = loadTexts(txtfile,limit=LIM)

Load  5000  lines from  ./Data/imdb.pos  /  1  lines discarded
Load  5000  lines from  ./Data/imdb.neg  /  1  lines discarded


In [4]:
postxt[:10]

[['do', "n't", 'miss', 'it', 'if', 'you', 'can'],
 ['dreams', 'of', 'a', 'young', 'girl'],
 ['funny', 'funny', 'movie', '!'],
 ['pride', 'and', 'prejudice', 'is', 'absolutely', 'amazing', '!', '!'],
 ['quirky', 'and', 'effective'],
 ['mike', 'leigh', "'s", 'best', 'and', 'the', 'best', 'of', '2010'],
 ['an', 'experience', 'unmatched', 'in', 'film'],
 ['if',
  'john',
  'woo',
  'were',
  'to',
  'of',
  'filmed',
  'the',
  'wizard',
  'of',
  'oz',
  'on',
  'the',
  'set',
  'of',
  'the',
  'wild',
  'bunch'],
 ['it',
  'has',
  'its',
  'shortcomings',
  ',',
  'and',
  'i',
  'presume',
  'the',
  'book',
  'would',
  'be',
  'much',
  'better',
  'but',
  'it',
  'is',
  'still',
  'well',
  'worth',
  'watching'],
 ['csi', 'meets', 'the', 'x', 'files']]

Split the data between train / dev / test, for example by creating lists txt_train, label_train, txt_dev, ... You should take care to keep a 50/50 ratio between positive and negative instances in each set.

In [5]:
# A label of 1 means that the review is positive, 0 means negative

label_pos = [1. for i in range(len(postxt))]
label_neg = [0. for i in range(len(negtxt))]

In [6]:
trainset = postxt + negtxt
labels = label_pos + label_neg

test_size = 0.2
dev_size = 0.2

# Split off test set
X_traindev, X_test, y_traindev, y_test = train_test_split(trainset,
                                                         labels,
                                                         test_size=test_size,
                                                         random_state=42,
                                                         stratify=labels)
# Divide leftover data in train and dev set
X_train, X_dev, y_train, y_dev = train_test_split(X_traindev,
                                                  y_traindev,
                                                  test_size=dev_size/(1-test_size),
                                                  random_state=42,
                                                  stratify=y_traindev)    

In [7]:
X_train[:10]

[['far', 'and', 'away', ',', 'the', 'best', 'of', 'the', 'draculas', '!'],
 ['hg', 'wells', 'in', 'name', 'alone'],
 ['horrible'],
 ['what', 'the', 'hell', '!', '!', '!'],
 ['not',
  'quite',
  'a',
  'classic',
  ',',
  'but',
  'worth',
  'the',
  'watch',
  'all',
  'the',
  'same'],
 ['avatar', 'pocahontas', 'in', 'space'],
 ['blah', 'fest', '2003'],
 ['worst', 'cooper', 'movie', 'ever'],
 ['boring', ',', 'inaccurate', ',', 'uninspired'],
 ['a', 'great', 'hammer', 'film', 'even', 'without', 'christopher', 'lee']]

In [8]:
y_train[:10]

[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]

# Converting data to Pytorch tensors

We will first convert data to Pytorch tensors so they can be used in a neural network.
To do that, you must first create a dictionnary that will map words to integers.
Add to the dictionnary only words that are in the training set (be sure to understand why we do that!).

Then, you can convert the data to tensors:
- use tensors of longs: both the sentence and the label will be represented as integers, not floats!
- these tensors do not require a gradient

A tensor representing a sentence is composed of the integer representation of each word, e.g. [10, 256, 3, 4].
Note that some words in the dev and test sets may not be in the dictionnary! (i.e. unknown words)
You can just skip them.

In [9]:
vocab_size = 0
dictionary = {}
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        if X_train[i][j] not in dictionary:
            dictionary[X_train[i][j]] = vocab_size
            vocab_size += 1

In [10]:
def line_to_tensor(line):
    return th.LongTensor([dictionary[word] for word in line if word in dictionary])

def lines_to_tensors(data):
    max_len = max(len(line) for line in data)
    tensors = th.LongTensor(np.zeros((len(data), max_len), dtype='int'))
    for i in range(len(data)):
        tensor = line_to_tensor(data[i])
        tensors[i, :len(tensor)] = tensor
    return tensors

It's handiest to already have all the sentences in the dataset transformed to tensors so we can have a cleaner training loop.

In [11]:
X_train_tnsr, X_dev_tnsr, X_test_tnsr = lines_to_tensors(X_train), lines_to_tensors(X_dev), lines_to_tensors(X_test)

# The labels need to be transformed into tensors, which is what the following code does
y_train = th.Tensor(np.array(y_train))
y_dev = th.Tensor(np.array(y_dev))
y_test = th.Tensor(np.array(y_test))

In [12]:
print(X_train_tnsr, len(X_train_tnsr[2]))

tensor([[   0,    1,    2,  ...,    0,    0,    0],
        [   9,   10,   11,  ...,    0,    0,    0],
        [  14,    0,    0,  ...,    0,    0,    0],
        ...,
        [5947,    0,    0,  ...,    0,    0,    0],
        [5948,  795,    0,  ...,    0,    0,    0],
        [   5,  390,  391,  ...,    0,    0,    0]]) 41


Apparently the maximum length of a sentence in the training set is 41, thus all the tensors will have this length. The sentences that are shorter are transformed into a tensor with zeros padded at the end.

# Neural network definition

In [13]:
class CNN_layer1DBatched(nn.Module):
    def __init__(self, emb_dim, window_size, vocab_size, n_filters, device):
        super(CNN_layer1DBatched, self).__init__()
        self.emb_dim = emb_dim
        self.window_size = window_size
        self.n_filters = n_filters
        self.device = device
        
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.conv1 = nn.Linear(window_size * emb_dim, n_filters)
        
        fulconn_layers = []
        fulconn_layers.append(nn.ReLU())
        fulconn_layers.append(nn.Linear(n_filters, 1))
        self.fulconn = nn.Sequential(*fulconn_layers)
        
    def sliding_window(self, embs, n_window_steps, output_tensor):
        '''
        This function takes the following inputs:
        1. Sentence embeddings (torch vector)
        2. n_window_steps, or sentence length divided by window_size divided by
        one.
        3. The tensor to store the outputs in.
        '''
        for step in range(n_window_steps):
            embs_before_concat = [embs[:, step+i, :] for i in range(self.window_size)]
            concat_array = th.cat(embs_before_concat, axis=1)
            output_tensor[:, :, step] = self.conv1(concat_array)
        return output_tensor
            
    def forward(self, inputs):
        n_window_steps = inputs.shape[1] - self.window_size + 1
        conv_output = th.zeros(inputs.shape[0], self.n_filters, n_window_steps)
        embs = self.embeddings(inputs)
        conv_output = self.sliding_window(embs, n_window_steps, conv_output)
        pooling_layer = nn.MaxPool1d(kernel_size = n_window_steps)
        out = F.relu(pooling_layer(conv_output))
        out = self.fulconn((out.reshape(inputs.shape[0],-1)).to(self.device)).reshape((-1))
        return out

## Loss function

Create a loss function builder.

- Pytorch loss functions are documented here: https://pytorch.org/docs/stable/nn.html#loss-functions
- In our case, we are interested in *BCELoss* and *BCEWithLogitsLoss*. Read their documentation and choose the one that fits with your network output

In [14]:
BCELogitLoss = th.nn.BCEWithLogitsLoss()

## Training loop

Write your training loop!

- parameterizable number of epochs
- at each epoch, print the mean loss and the dev accuracy

In [15]:
class trainingLoopClassification():
    def __init__(self, model, optimizer, loss, accuracy, n_epochs, batch_size, gradient_clip, device):
        self.model = model
        self.optimizer = optimizer
        self.loss = loss
        self.accuracy = accuracy
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.gradient_clip = gradient_clip
        self.device = device
        self.model.to(device)
        
    def init_weights(self, m):
        '''
        This function initializes the weights and biases of all the layers in the
        network. This function automatically only initializes the nn.Linear type
        objects due to the if-statement check. It uses kaiming initialization and
        it initializes the biases with zeros.
        '''
        
        if type(m) == nn.Linear:
            nn.init.kaiming_uniform_(m.weight.data) 
            nn.init.zeros_(m.bias.data)   
        
    def train(self, train_data, train_labels, dev_data, dev_labels):
        self.model.apply(self.init_weights)
        
        loss_list = []
        dev_acc_list = []
        
        for epoch in range(n_epochs):
            print('Starting epoch: {}'.format(epoch))
            self.model.train()
            cost = 0
            
            for first in range(0, len(train_data), batch_size):
                self.model.zero_grad()                                     
                
                batch_input = th.cat(
                    [
                        sentence.reshape(1, -1)
                        for sentence in train_data[first:first + batch_size]
                        
                    ],
                        dim=0
                ).to(self.device)
                batch_labels = train_labels[first:first + batch_size].to(self.device)
                
                output = self.model(batch_input)
                                                
                loss = self.loss(output, batch_labels)
                                
                loss.backward()
                
                th.nn.utils.clip_grad_value_(self.model.parameters(), self.gradient_clip)
                
                self.optimizer.step()
                
                cost += loss.item()
                
            mean_loss = cost / (len(train_data)/batch_size+1)
            acc = self.accuracy(self.model, dev_data, dev_labels, self.device)
            
            print('mean loss: ', mean_loss)
            print('dev accuracy: ', acc)
            
            loss_list.append(mean_loss)
            dev_acc_list.append(acc)
        
        return self.model, loss_list, dev_acc_list
    
    def plot_graphs(self,
                    mean_losss, 
                    dev_accus, 
                    embedding_size, 
                    n_hidden_layers):
        plt.plot([i for i in range(EPOCHS)],mean_losss, label='mean loss')
        plt.plot([i for i in range(EPOCHS)],dev_accus, label='accuracy on dev')
        plt.xlabel('Epochs')
        plt.title('{} hidden layers, {} embedding_size'.format(n_hidden_layers,
                                                              embedding_size))
        plt.legend()
        plt.savefig('{}HiddenLayer{}.png'.format(n_hidden_layers,
                                                embedding_size))
        plt.show()

In [16]:
def accuracy(model,
             X, 
             y,
            device):
    correct = 0
    total = 0
    for data, label in zip(X, y):
        x = data
        if x.size()[0] <= 0:
            continue
        output = model(x.reshape(1, -1).to(device))
        total += 1
        if (output >= 0) == (label == 1.):
            correct += 1
    return correct/total * 100

In [17]:
embedding_size = 10
window_size = 2
vocab_size = vocab_size
n_filters = 10
device = th.device('cuda')
    
model = CNN_layer1DBatched(embedding_size, window_size, vocab_size, n_filters, device)

lr = 0.001
optimizer = th.optim.Adam(model.parameters(),lr=lr)

loss = BCELogitLoss
accuracy = accuracy
n_epochs = 20
batch_size = 32
gradient_clip = 5.

models_list = []
mean_loss_list = []
dev_accuracy_list = []

training_loop = trainingLoopClassification(model, optimizer, loss, accuracy, n_epochs, batch_size, gradient_clip, device)

model, mean_losses, dev_accus = training_loop.train(X_train_tnsr, y_train, X_dev_tnsr, y_dev)

models_list.append(model)
mean_loss_list.append(mean_losses)
dev_accuracy_list.append(dev_accus)

Starting epoch: 0
mean loss:  1.0118235447678705
dev accuracy:  52.800000000000004
Starting epoch: 1
mean loss:  0.7267079283767417
dev accuracy:  56.75
Starting epoch: 2
mean loss:  0.6767485669816521
dev accuracy:  59.099999999999994
Starting epoch: 3
mean loss:  0.6442383276372753
dev accuracy:  60.650000000000006
Starting epoch: 4
mean loss:  0.6170584455092959
dev accuracy:  62.0
Starting epoch: 5
mean loss:  0.5921059342531058
dev accuracy:  62.8
Starting epoch: 6
mean loss:  0.5683864853110174
dev accuracy:  63.4
Starting epoch: 7
mean loss:  0.5451755930005081
dev accuracy:  64.35
Starting epoch: 8
mean loss:  0.5223627639069798
dev accuracy:  64.9
Starting epoch: 9
mean loss:  0.4999270685787859
dev accuracy:  65.85
Starting epoch: 10
mean loss:  0.47810708670148166
dev accuracy:  66.60000000000001
Starting epoch: 11
mean loss:  0.4566439964094592
dev accuracy:  66.60000000000001
Starting epoch: 12
mean loss:  0.435531118029941
dev accuracy:  66.85
Starting epoch: 13
mean loss

In [18]:
test_accuracy = accuracy(model, X_test_tnsr, y_test, device)
test_accuracy

70.3