In [1]:
from tqdm.notebook import tqdm
import pickle
import numpy as np
import torch

## Task I: Word-based CNN for Text Classification

### 1. Data

The dataset that we are going to use is the imdb dataset of movie reviews. These are labelled by sentiment (positive/negative). 

The reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). 

For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".

More information regarding the dataset can be found in the official [documentation](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification).


In [2]:
# Load the data
train_samples, train_labels, test_samples, test_labels = pickle.load(open('./imdb.pkl', 'rb'))

print(len(train_samples), 'train sequences')
print(len(test_samples), 'test sequences')

25000 train sequences
25000 test sequences


### 2. Preprocess the text data 

In this particular case, where we are using the imdb dataset there is no need to do all the traditional preprocessings that we normally do when dealing with NLP problems. Part of them are already done at this point.

  - Split the dataset in train and test (maybe also validation).
  - Tokenize and transform to integer index. Here we would need to: 
    - instantitate a *Tokenizer()* object, 
    - fit that object on the text on which we are training the model (use the *fit_on_texts()* method)
    - call *texts_to_sequences()* for both the training and the test text.

  - **Add padding to ensure that all vectors have the same dimensionality.** Note that this is the only pre-processing that needs to be done in the case of the current imdb dataset.

In [3]:
desired_length = 1000

# TODO 2. Pad sequences

# define a function that receives a set of samples and a desired length and:
# 1. cuts all sequences longer than the desired length to the desired length
# (one improvement would be to perform this action in the torch dataset and feed a random subseqence 
# of the desired length each time)
# 2. pad shorter sequences with the value 10001

def pad(samples, length):
    return np.array([
        sample[:length] if len(sample) > length else list(sample) + [10001]*(length - len(sample))
        for sample in samples]
    )


train_samples = pad(train_samples, desired_length)
test_samples = pad(test_samples, desired_length)

print('train_samples shape:', train_samples.shape)
print('test_samples shape:', test_samples.shape)

train_samples shape: (25000, 1000)
test_samples shape: (25000, 1000)


### 3.  Define the model de dataset and the training loop

Similar to the privious lab while following the model architecture described in the comments.

In [4]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        # Define an embedding layer with a vocabulary size of 10002
        # an output embedding size of 100
        # and a padding_idx equal to the one used - 10001
        self.embedding = torch.nn.Embedding(10002, 100, 10001)
        self.drop = torch.nn.Dropout(0.4)
        self.conv1 = torch.nn.Sequential(
        # A 1D Convolutional layer with 100 input channels, 128 output channels, kernel size of 3 and a padding of 1
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
            torch.nn.Conv1d(100, 128, 3, padding=1),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(2))
        self.conv2 = torch.nn.Sequential(

        # A 1D Convolutional layer with 128 input channels, 128 output channels, kernel size of 5 and a padding of 2
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
            torch.nn.Conv1d(128, 128, 5, padding=2),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(2))
        self.conv3 = torch.nn.Sequential(
        
        # A 1D Convolutional layer with 128 input channels, 128 output channels, kernel size of 5 and a padding of 2
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
            torch.nn.Conv1d(128, 128, 5, padding=2),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(2))


        # A global Average pooling layer, which in this scenario, will be an 1D Avgerage Pooling layer
        # with size 125 and stride 125
        self.avg = torch.nn.AvgPool1d(125,125)
        # A Flattening layer
        self.flat =    torch.nn.Flatten()
        # A Linear layer with 128 input features and 2 outputs and no activation function
        self.lin =     torch.nn.Linear(128, 2)
        
    def forward(self, input):
        # forward the input through the embedding layer
        out = self.embedding(input)
        out = self.drop(out)
        # permute the input such that it becomes channels first
        out = out.permute(0,2,1)

        # print(out.shape)
        # forward the input through the rest of the sequence of layers
        out = self.conv1(out)
        # print(out.shape)
        out = self.conv2(out)
        # print(out.shape)
        out = self.conv3(out)
        # print(out.shape)
        out = self.avg(out)
        # print(out.shape)
        out = self.flat(out)
        # print(out.shape)
        out = self.lin(out)
        return out
        


In [5]:
        
# define a dataset class that feeds the samples from train or test, depending on the use case
class Dataset(torch.utils.data.Dataset):
    def __init__(self, use = 'train'):
        if use == 'train':
            
            self.data = train_samples
            self.labels = train_labels
        elif use == 'test':
            self.data = test_samples
            self.labels = test_labels
            
    def __getitem__(self, sample_n):
        return self.data[sample_n], self.labels[sample_n]
    
    def __len__(self):
        return self.data.shape[0]
device = torch.device('cuda')
# instantiate the model
model = Model()
model.to(device)
# define an Adam optimizer for the model with a lr of 1e-3
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
# define a Cross Entropy loss function
loss_func = torch.nn.CrossEntropyLoss()

# define the training dataset and dataloader and the test dataset and dataloader
train_ds = Dataset()
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=64)
test_ds = Dataset(use='test')
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64)

# write the training loop as defined in Lab 1 and train the model

In [6]:
def train(model, optimizer, loss_func, dl, val_dl, epochs):
  
    for epoch in tqdm(range(epochs)):
        for img, label in tqdm(train_dl):
            
            optimizer.zero_grad()     
            
            img = img.to(device)
            label = label.to(device)
            
            output = model(img)
            
            loss = loss_func(output, label)
            loss.backward()
            optimizer.step()
        accs = []
        # return
        for batch in val_dl:

            img, label = batch
            img = img.to(device)
            label = label.to(device)
            with torch.no_grad():
                output = model(img)

            predict = output.argmax(1)
            acc = (predict == label).float().mean().detach().cpu().numpy()
            accs.append(acc)
        print(f{np.mean(accs):.3f})

In [7]:
train(model, optimizer, loss_func, train_dl, test_dl, 2)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

0.8065218


  0%|          | 0/391 [00:00<?, ?it/s]

0.8435023
