In [None]:
import torch
from torch import nn, optim, tensor
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vectors
import pandas as pd
import time

# Show the full contents of text columns when DataFrames are displayed (no truncation)
pd.options.display.max_colwidth = None


# ========================
# LOADING DATA - GENERAL

class TSVDataset(Dataset):
    """An in-memory dataset of (label, text) pairs read from a tab-separated file.

    Every non-blank line of the file is expected to have the form
    ``label<TAB>text``.
    """

    def __init__(self, filepath):
        """Loads the data from a provided filepath

        Arguments
        ---------
        filepath: str; path to a UTF-8 encoded file with one
                  ``label<TAB>text`` record per line

        Raises
        ------
        ValueError: if a non-blank line does not contain a tab separator
        """
        self.data = list()
        with open(filepath, encoding="utf-8") as in_file:
            for (line_number, line) in enumerate(in_file, start=1):
                record = line.strip()
                if not record:
                    # Blank lines (e.g. an empty line at the end of the file) carry no data
                    continue
                # Split on the first tab only, so that tabs inside the text are kept
                (label, separator, text) = record.partition("\t")
                if not separator:
                    raise ValueError(
                        "{}: line {} is not of the form 'label<TAB>text'".format(filepath, line_number)
                    )
                self.data.append((label, text))

    def __getitem__(self, idx):
        """Returns the (label, text) datapoint at a given index"""
        return self.data[idx]

    def __len__(self):
        """Returns the number of datapoints in the dataset"""
        return len(self.data)

spacy_tokenizer = get_tokenizer('spacy', language="en_core_web_sm")


def tokenizer(text):
    """Splits text into tokens with the spaCy tokenizer and lowercases each token"""
    return [token.lower() for token in spacy_tokenizer(text)]
    
def text_to_indices(text):
    """Tokenizes a text and returns its vocabulary indices as an int64 tensor"""
    return torch.tensor(vocab(tokenizer(text)), dtype=torch.int64)

def label_to_index(label):
    """Maps the label "pos" to 1 and any other label to 0"""
    return 1 if label == "pos" else 0

def data_to_indices(data):
    """Converts a (label, text) pair into a (label index, tensor of token indices) pair"""
    (label, text) = data
    label_index = label_to_index(label)
    text_indices = text_to_indices(text)
    return (label_index, text_indices)

# Read the (label, text) pairs used for training and for evaluation into memory
train_data = TSVDataset("inputs/imdb-train.tsv")
test_data = TSVDataset("inputs/imdb-test.tsv")


# =================================================
# SETTING UP THE VOCAB AND EMBEDDINGS - GENERAL

def yield_tokens(data):
    """Lazily yields the list of tokens for the text of each (label, text) pair"""
    yield from (tokenizer(text) for (_, text) in data)

# NOTE: this is a generator, so it can be iterated over only once
tokenized_iter = yield_tokens(train_data)
# Word vectors read from a local file
# (presumably 50-dimensional GloVe vectors, judging by the file name -- TODO confirm)
embeddings = Vectors("inputs/glove_6B_50_sample_train.txt")


# ========================================
# TRAINING AND TESTING - GENERAL

def train(model, dataloader, optimizer, epochs=100, print_every=1,
          validation_data=None):
    """Train a PyTorch model and print results periodically
    
    Arguments
    ---------
    model: torch.nn.Module; the model to be trained
    dataloader: torch.utils.data.DataLoader; the training data
    optimizer: the PyTorch optimizer to use for training
    epochs: int; the number of complete cycles through the training data
    print_every: int; print the results after this many epochs
                 (does not print if this is None)
    validation_data: torch.utils.data.DataLoader; the validation data
    """
    start_time = time.time()
    
    if print_every is not None:
        # Print initial performance
        initial_performance = test(model, dataloader)
        log_message = '| epoch   0 | train acc {acc:6.3f} | train loss {loss:6.3f} |'.format(**initial_performance)
        if validation_data is not None:
            validation_performance = test(model, validation_data)
            log_message += ' valid acc {acc:6.3f} | valid loss {loss:6.3f} |'.format(**validation_performance)
        print(log_message)
        
        # Set up trackers for printing results along the way
        total_acc = 0
        total_count = 0
        current_loss = 0.0
        minibatches_per_log = len(dataloader) * print_every
        
    # Tell the model that these inputs will be used for training
    model.train()
        
    for epoch in range(epochs):
        # Within each epoch, iterate over the data in mini-batches
        # Note the use of *datapoint_list for generality, whether or not there are offsets
        for (label_list, *datapoint_list) in dataloader:
            
            # Clear out gradients accumulated from inputs in the previous mini-batch
            model.zero_grad()

            # Run the forward pass to make predictions for the mini-batch
            predicted_probs = model(*datapoint_list).view(-1)

            # Compute the loss and send it backward through the network to get gradients
            # Note: PyTorch averages the loss over all datapoints in the minibatch
            loss = model.loss_function(predicted_probs, label_list.to(torch.float32))
            loss.backward()
            
            # Nudge the weights
            optimizer.step()
            
            # Track performance
            if print_every is not None: 
                total_acc += ((predicted_probs > 0.5).to(torch.int64) == label_list).sum().item()
                total_count += label_list.size(0)
                current_loss += loss.item()

        # Log performance
        if print_every is not None and (epoch + 1) % print_every == 0:
            log_message = ('| epoch {:3d} | train acc {:6.3f} | train loss {:6.3f} |'
                           .format(epoch + 1, total_acc/total_count, current_loss/minibatches_per_log))
            if validation_data is not None:
                validation_performance = test(model, validation_data)
                log_message += ' valid acc {acc:6.3f} | valid loss {loss:6.3f} |'.format(**validation_performance)
            print(log_message)

            # Reset trackers after logging
            total_acc = 0
            total_count = 0
            current_loss = 0.0
            model.train()
            
    print("\nOverall training time: {:.0f} seconds".format(time.time() - start_time))
            
def test(model, dataloader):
    """Evaluate a PyTorch model by testing it on labeled data
    
    Arguments
    ---------
    model: torch.nn.Module; the model to be tested
    dataloader: torch.utils.data.DataLoader; the test data
    """
    # Tell the model that these inputs will be used for evaluation
    model.eval()
    
    # Set up trackers
    total_acc = 0
    total_count = 0
    loss = 0.0

    with torch.no_grad(): # This can speed things up by telling PyTorch to ignore gradients
        # Note the use of *datapoint_list for generality, whether or not there are offsets
        for (label_list, *datapoint_list) in dataloader:
            # Get the model's output predictions
            predicted_probs = model(*datapoint_list).view(-1)
            predicted_labels = (predicted_probs > 0.5).to(torch.int64)
            
            # Calculate the loss and accuracy
            loss += model.loss_function(predicted_probs, label_list.to(torch.float32)).item()
            total_acc += (predicted_labels == label_list).sum().item()
            total_count += label_list.size(0)
    
    performance = {"acc": total_acc/total_count, "loss": loss/len(dataloader)}
    return performance


# ==================================
# INSPECTING A MODEL - GENERAL

def display_weights(model):
    """Prints each named parameter of a model (name in upper case), followed by a blank line"""
    for (label, weights) in model.named_parameters():
        print(label.upper(), weights, end="\n\n")
        
def predict_multiple(model, texts, collate_batch_fn, labels=("neg", "pos")):
    """Returns a table of a model's predictions for a list of input texts.

    Arguments
    ---------
    model: torch.nn.Module; a PyTorch model that outputs one probability per input
    texts: list(str); a list of untokenized strings to feed as input to the model
    collate_batch_fn: function; a function that is used to prepare (batched) data
                      to be input into the model. It receives a list of
                      (None, text) pairs; the first element of what it returns
                      is ignored and the rest is passed to the model
    labels: sequence(str); the labels that correspond to the indices the
            model will output: labels[0] is used for probabilities up to 0.5,
            labels[1] for probabilities above 0.5

    Returns
    -------
    pandas.DataFrame with the columns "Input text", "Classifier probability"
    and "Output label"
    """
    # Tell the model not to use these inputs for training
    model.eval()

    # Convert the input texts to indices, and get other model arguments needed.
    # There are no true labels here, so None stands in for them.
    data = [(None, text) for text in texts]
    (_, *model_input) = collate_batch_fn(data)

    # Feed the inputs through the model
    with torch.no_grad():
        probs = model(*model_input).view(-1)

    # Collate the predictions in a DataFrame
    # (handing pandas a plain array rather than a PyTorch tensor)
    predictions = pd.DataFrame({
        "Input text": texts,
        "Classifier probability": probs.detach().cpu().numpy(),
    })
    predictions["Output label"] = labels[0]
    # Use the caller's label for the positive class rather than a hard-coded one
    predictions.loc[predictions["Classifier probability"] > 0.5, "Output label"] = labels[1]
    return predictions

        
# =================================
# LOADING DATA - SPECIFIC TO BOE

def collate_batch_boe(batch):
    """Turns a batch of (label, text) pairs into the three tensors used by a
    bag-of-embeddings model: the labels, the token indices of all texts
    concatenated into one long tensor, and the offset at which each text
    starts within that tensor.
    """
    converted = [data_to_indices(example) for example in batch]
    labels = [label_index for (label_index, _) in converted]
    texts = [text_indices for (_, text_indices) in converted]

    # A text starts where the previous one ends: shift the lengths right by one
    # position (the first text starts at 0) and accumulate them
    shifted_lengths = [0] + [text_indices.size(0) for text_indices in texts]

    return (
        torch.tensor(labels, dtype=torch.int64),
        torch.cat(texts),
        torch.tensor(shifted_lengths[:-1]).cumsum(dim=0),
    )
          
# Mini-batches for the bag-of-embeddings model: small, reshuffled batches for
# training, and large batches (in dataset order) for evaluation
train_dataloader_boe = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch_boe)
test_dataloader_boe = DataLoader(test_data, batch_size=1000, collate_fn=collate_batch_boe)


In [None]:
class RNNClassifier(nn.Module):
    """A binary text classifier: word embeddings -> RNN -> hidden layer -> sigmoid unit.

    The final-timestep activations of every recurrent layer (and of both
    directions, if bidirectional) are concatenated into a single document
    embedding, which a ReLU hidden layer and a one-unit sigmoid output layer
    turn into a probability. The matching loss is available as
    ``loss_function`` (binary cross-entropy).
    """

    def __init__(self, vocab, embeddings, recurrent_dim, hidden_dim, freeze_embeddings=True,
                 recurrent_activation="tanh", recurrent_layers=1, recurrent_bidirectional=False):
        """Builds the layers of the classifier

        Arguments
        ---------
        vocab: the vocabulary; must provide get_itos() and get_stoi()
        embeddings: the pretrained embeddings; must provide
                    get_vecs_by_tokens() and a dim attribute
        recurrent_dim: int; the number of units in each recurrent layer
        hidden_dim: int; the number of units in the hidden layer
        freeze_embeddings: bool; whether to keep the embeddings fixed during training
        recurrent_activation: str; the nonlinearity of the recurrent layers
        recurrent_layers: int; the number of stacked recurrent layers
        recurrent_bidirectional: bool; whether the recurrent layers are bidirectional
        """
        super().__init__()

        self.vocab = vocab

        # Look up one pretrained vector per vocabulary entry, in index order,
        # and mark the <pad> entry as padding for the embedding layer
        pretrained_weights = embeddings.get_vecs_by_tokens(self.vocab.get_itos())
        pad_index = self.vocab.get_stoi().get("<pad>")
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_weights,
            freeze=freeze_embeddings,
            padding_idx=pad_index,
        )

        # The recurrent layer(s) read the word embeddings; batch_first because
        # each mini-batch is laid out as (sequence, timestep, feature)
        self.recurrent_layer = nn.RNN(
            embeddings.dim,
            recurrent_dim,
            nonlinearity=recurrent_activation,
            num_layers=recurrent_layers,
            bidirectional=recurrent_bidirectional,
            batch_first=True,
        )

        # The document embedding concatenates the final state of every layer
        # in every direction, so the hidden layer's input is sized accordingly
        num_directions = int(1 + recurrent_bidirectional)
        doc_embedding_dim = recurrent_dim * recurrent_layers * num_directions
        self.hidden_layer = nn.Sequential(nn.Linear(doc_embedding_dim, hidden_dim), nn.ReLU())

        # A single sigmoid unit turns the hidden layer into a probability
        self.output_layer = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())

        self.loss_function = nn.BCELoss()

    def forward(self, padded_text, seq_lengths):
        """Returns one probability per sequence, as a tensor of shape (minibatch_size, 1)

        Arguments
        ---------
        padded_text: tensor of token indices, one padded sequence per row
        seq_lengths: the unpadded length of each sequence in the mini-batch
        """
        embedded = self.embedding(padded_text)

        # Packing lets the RNN stop at the true end of each sequence
        # instead of running over the padding
        packed = pack_padded_sequence(embedded, seq_lengths, batch_first=True, enforce_sorted=False)
        (_, final_states) = self.recurrent_layer(packed)

        # final_states holds, for each layer and direction, the activation at the
        # last real timestep of every sequence:
        # (num_layers * num_directions, minibatch_size, recurrent_dim).
        # Put the mini-batch dimension first, then flatten the rest into one
        # document embedding per sequence
        per_sequence = final_states.transpose(0, 1)
        doc_embedding = per_sequence.reshape(per_sequence.size(0), -1)

        return self.output_layer(self.hidden_layer(doc_embedding))
        

In [None]:
# Build the vocabulary from a fresh token generator rather than from the
# module-level tokenized_iter: a generator can be consumed only once, so
# re-running this cell with an already-exhausted generator would silently
# produce a vocabulary that contains nothing but the special symbols
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<pad>", "<unk>"])
# Tokens that are not in the vocabulary are mapped to <unk>
vocab.set_default_index(vocab["<unk>"])

# <pad> is one of the specials above, so it is guaranteed to be in the vocabulary
padding_idx = vocab["<pad>"]
print("The <pad> symbol in this vocabulary is represented by the index {}".format(padding_idx))

In [1]:
def collate_batch_rnn(batch):
    """Turns a batch of (label, text) pairs into the three components used by
    an RNN model: a tensor of labels, a tensor holding the token indices of
    every text padded to the length of the longest one, and a list with the
    unpadded length of each text.
    """
    converted = [data_to_indices(example) for example in batch]
    labels = [label_index for (label_index, _) in converted]
    sequences = [text_indices for (_, text_indices) in converted]

    # The true lengths are needed later to tell real tokens from padding
    seq_lengths = [len(text_indices) for text_indices in sequences]

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    text_tensor = pad_sequence(sequences, padding_value=padding_idx, batch_first=True)

    return (label_tensor, text_tensor, seq_lengths)
          
# Mini-batches for the RNN model: small, reshuffled batches for training,
# and large batches (in dataset order) for evaluation
train_dataloader_rnn = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch_rnn)
test_dataloader_rnn = DataLoader(test_data, batch_size=1000, collate_fn=collate_batch_rnn)

NameError: name 'DataLoader' is not defined

In [None]:
# Demonstration of padding: texts of different lengths are converted to
# indices, then padded with the <pad> index to the length of the longest text
texts = ["I hated it", "it was quite terrible", "i really REALLY loved it"]
print("texts:\n{}\n".format(texts))

# One tensor of vocabulary indices per text
text_indices = list(map(text_to_indices, texts))
print("Converted to indices:\n{}\n".format(text_indices))

# One row per text, with the shorter texts filled up with the <pad> index
padded = pad_sequence(text_indices, padding_value=padding_idx, batch_first=True)
print("Padded:\n{}".format(padded))

In [None]:
with torch.no_grad(): # This tells PyTorch we aren't going to be doing backprop

    # Define simple embedding and recurrent layers.
    # The embedding of a token is just its vocabulary index (as a 1-dimensional vector);
    # the table has one row per vocabulary entry, so every possible index is covered.
    # The recurrent layer has all weights fixed to 1 and a total bias of 1
    # (0 on the input side, 1 on the hidden side),
    # so it adds together all of the inputs up to the current timestep,
    # plus the number of timesteps since the beginning of the sequence
    embedding_layer = nn.Embedding.from_pretrained(
        torch.arange(len(vocab), dtype=torch.float32).view(-1, 1),
        freeze=True, padding_idx=padding_idx)
    recurrent_layer = nn.RNN(1, 1, batch_first=True, nonlinearity="relu")
    recurrent_layer.bias_ih_l0[0] = 0.0
    recurrent_layer.bias_hh_l0[0] = 1.0
    recurrent_layer.weight_ih_l0[0, 0] = 1.0
    recurrent_layer.weight_hh_l0[0, 0] = 1.0

    # Derive the batch size and the true sequence lengths from the demonstration
    # data itself, so that this cell stays correct if the texts are changed
    num_sequences = len(texts)
    seq_lengths = [len(indices) for indices in text_indices]

    # Get the word embeddings for the padded sequence
    word_embeddings = embedding_layer(padded)

    # With packing: pack the word embeddings,
    # run the recurrent layer on the packed padded embeddings,
    # and then unpack the results
    packed_word_embeddings = pack_padded_sequence(word_embeddings, seq_lengths, batch_first=True, enforce_sorted=False)
    (final_layer_all_timesteps, all_layers_final_timestep) = recurrent_layer(packed_word_embeddings)
    # pad_packed_sequence returns a (padded activations, sequence lengths) pair
    final_layer_all_timesteps = pad_packed_sequence(final_layer_all_timesteps, batch_first=True, padding_value=padding_idx)

    print("===========================")
    print("WITH PACKING")
    print("---------------------------")
    print("Recurrent layer activation at all timesteps, for each sequence:")
    print(final_layer_all_timesteps[0].view(num_sequences, -1))
    print("\nWhat PyTorch gets as the recurrent layer activation at the \"final\" timestep for each sequence:")
    for (seqnum, value) in enumerate(all_layers_final_timestep.view(-1), start=1):
        print("Sequence {}: {}".format(seqnum, int(value)))

    # Without packing: run the recurrent layer on the padded embeddings directly,
    # so the padding positions are processed as if they were real timesteps
    (final_layer_all_timesteps, all_layers_final_timestep) = recurrent_layer(word_embeddings)

    print("\n===========================")
    print("WITHOUT PACKING")
    print("---------------------------")
    print("Recurrent layer activation at all timesteps, for each sequence:")
    print(final_layer_all_timesteps.view(num_sequences, -1))
    print("\nWhat PyTorch gets as the recurrent layer activation at the \"final\" timestep for each sequence:")
    for (seqnum, value) in enumerate(all_layers_final_timestep.view(-1), start=1):
        print("Sequence {}: {}".format(seqnum, int(value)))