# Requirements
Requires an Nvidia GPU to run

Create a new anaconda environment and run the following commands to install the required libraries 
```
conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c conda-forge
conda install gensim
pip install torchdyn
pip install git+https://github.com/google-research/torchsde.git
```

# Citations
- Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993). Building a Large Annotated Corpus of English: The Penn Treebank

```
@article{poli2020torchdyn,
  title={TorchDyn: A Neural Differential Equations Library},
  author={Poli, Michael and Massaroli, Stefano and Yamashita, Atsushi and Asama, Hajime and Park, Jinkyoo},
  journal={arXiv preprint arXiv:2009.09346},
  year={2020}
}
```

- GloVe

- GPT2 paper

- Huggingface for their implementation of transformers? Not sure if this has a paper


# To do
- Consider other variants of Neural ODE
- Implement and see results from my continuous language modelling idea

In [1]:
import torch
import torch.utils.data
import torchtext
import numpy as np
import gensim.downloader as api
from functools import reduce
from sklearn.metrics import *
from time import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre Processing
- Build the vocab
- Convert text corpus into padded word vector sequences

To do
- Use LSTM as baseline
    - Examine perplexity of model on validation set
- Implement Neural ODE

In [2]:
# load word embeddings
glove = api.load("glove-wiki-gigaword-300")

KeyboardInterrupt: 

In [4]:
train, valid, test = torchtext.datasets.PennTreebank(split=('train', 'valid', 'test'))
train = list(train) # these are originally iterators, the data is so small we can just retrieve all of it at once
valid = list(valid)
test  = list(test)

In [3]:
# build the vocab
corpus = train + valid
vocab = {"<PAD>": 0}
index_vocab = {0 : "<PAD>"}
for sentence in corpus:
    for token in sentence.split(" ")[1:]:
        if token not in vocab:
            index = len(vocab)
            vocab[token] = index
            index_vocab[index] = token

# replace penn treebank end sentence token "\n" with glove's end sentence token "."
index = vocab["\n"]
vocab.pop("\n")         
vocab["."] = index
index_vocab[index] = "."

# view size
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)

Vocab size:  10001


In [4]:
# pad sentences and convert words to their glove vector to get input features
# convert to 1 hot vocab and shift 1 to the left to get output labels (converting to 1 hot takes too much memory, so just store indices and convert later)
# use left padding, as we want the hidden state at the end (right) to ignore the padding
# returns word_vector_dataset, labels
def preprocess(dataset, sequence_length, wv):
    embedding_size = wv["hello"].shape[0]
    processed = np.zeros((len(dataset), sequence_length, embedding_size))
    labels = np.zeros((len(dataset), sequence_length, 1))
    
    for i in range(len(dataset)):
        tokens = dataset[i].split(" ")[1:]
        
        # get the word vectors for all of the tokens, removing out of vocabulary (OOV) tokens
        tokens_np = np.zeros((len(tokens), embedding_size))
        labels_np = np.zeros((len(tokens), 1))
        j = 0
        for word in tokens:
            if word == "\n": word = "." # replace PennTreebank end sentence token '\n' with glove end sentence token "."
            if word not in wv: continue # ignore OOV tokens
            if j < sequence_length - 1: # only add sequence_length - 1 tokens at max
                # so that there is always a 0 vector at the start so the model learns most common starting words
                tokens_np[j, :] = wv[word]
            # we can look ahead to find the next word to set as the label for the last word
            if j < sequence_length:
                labels_np[j, :] = vocab[word]
            else: break
            j += 1
            
        tokens_np = tokens_np[:j-1, :]
        labels_np = labels_np[:j, :]
        
        # add this sentence to the overall dataset, with left padding of 0 vectors
        processed[i, sequence_length - tokens_np.shape[0]:, :] = tokens_np
        labels[i, sequence_length - labels_np.shape[0]:, :] = labels_np
    return processed, labels

In [6]:
sequence_length = 20
train_X, train_y = preprocess(train, sequence_length, glove)
valid_X, valid_y = preprocess(valid, sequence_length, glove)
test_X , test_y  = preprocess(test,  sequence_length, glove)

In [7]:
# unit test to check the labelling works
assert preprocess(["hello there how are you doing \n"], 20, glove)[1][0][-1] == 25, "Output: {}".format(preprocess(["hello there how are you doing \n"], 20, glove)[1][0])

# LSTM Baseline
Create a baseline RNN and evaluate it's perplexity

In [3]:
class LSTMModel(torch.nn.Module):
    def __init__(self, vocab_size, input_size=100, layer_size=100, dropout=0):
        super().__init__()
        self.LSTM = torch.nn.LSTM(input_size, layer_size, 1, bidirectional=False)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear = torch.nn.Linear(layer_size, vocab_size)
        self.softmax = torch.nn.Softmax(dim=-1)
    
    def forward(self, x):
        # convert words to their vectors here
        sequence_outputs, hidden_state = self.LSTM(x)
        sequence_outputs = self.dropout(sequence_outputs)
        pred = self.linear(sequence_outputs)
        return pred
    
    # wrapper function that forward propagates, applies softmax and converts to numpy 
    def predict(self, x):
        preds = self.forward(x)
        preds = self.softmax(preds).detach().cpu().numpy()
        return preds

In [18]:
def numpy_to_tensor(array):
    return torch.from_numpy(array).to(device).float()

In [12]:
model = LSTMModel(vocab_size, input_size=300, layer_size=300, dropout=0.1)
model.to(device)
model.eval()

NameError: name 'LSTMModel' is not defined

In [12]:
%%time
# unit test to check that forward propagation works
data = numpy_to_tensor(train_X[:1000])
print(model.forward(data).shape)

torch.Size([1000, 20, 10001])
Wall time: 642 ms


In [13]:
# clear memory
del data
torch.cuda.empty_cache()

In [44]:
# Define functions to calculate perplexity for a single sentence: see the metric definition here https://web.stanford.edu/~jurafsky/slp3/3.pdf 
# We use teacher forcing (feeding the ground_truth label for sequence i to get pred for sequence i+1) to get the predictions
def perplexity(preds, ground_truth, epsilon=1e-30):
    probs = []
    for i in range(preds.shape[1]):
        probs.append(preds[0, i, int(ground_truth[i])])
    probs = np.array(probs)
    probs = np.power(1/(probs+epsilon), 1/probs.shape[0]) # normalise before taking the product, to prevent underflowing to 0
    return np.prod(probs)

# Calculate overall perplexity for a dataset
def average_perplexity(model, X, y):
    perplexities = [perplexity(model.predict(numpy_to_tensor(X[i:i+1])), y[i]) for i in range(X.shape[0])]
    return np.mean(perplexities)

In [15]:
# training the model
def train_model(model, train_X, train_y, epochs=10, learn_rate=0.01, weight_decay=0.001, minibatch_size=128, print_results=True):
    # Prepare data
    X = numpy_to_tensor(train_X)
    y = numpy_to_tensor(train_y).long()[:, :, 0]
    n_samples = X.shape[0]
    
    # Define loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate, weight_decay=weight_decay)

    # Ensure this runs on gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(epochs):      
        model.train() # set to train flag
        start_ts = time()
        
        # shuffle the data
        new_indices = torch.randperm(n_samples)
        X = X[new_indices, :, :] 
        y = y[new_indices, :]
        
        for batch_n in range(int(np.ceil(n_samples/minibatch_size))):
            # get the minibatch
            start_index = batch_n * minibatch_size
            end_index = min(start_index + minibatch_size, n_samples)
            batch_X = X[start_index: end_index, :, :]
            batch_y = y[start_index: end_index, :]
            
            # forward + backward + optimize
            optimizer.zero_grad()
            outputs = model(batch_X) 
            outputs = torch.swapaxes(outputs, 1, 2) # cross entropy expects a tensor of (n_samples, n_outputs, sequence_length)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
        # evaluate performance on part of the data (for memory reasons we take a subsample)
        if print_results:
            with torch.no_grad():
                model.eval()
                preds = np.argmax(X[:2000, :, :].detach().cpu().numpy(), axis=-1).flatten() # flatten the arrays so accuracy score works
                targets = y[:2000].detach().cpu().numpy().flatten()
                t_perplexity = average_perplexity(model, train_X[:2000], train_y[:2000])
                v_perplexity = average_perplexity(model, valid_X, valid_y)
                end_ts = time()
                print("Epoch {}, Minibatch loss: {:.2f}, Subsample Accuracy: {:.2f}, Train Perplexity: {:.2f}, Validation Perplexity: {:.2f}, Epoch Time: {:.2f} seconds".format(epoch, loss.item(),
                    accuracy_score(targets, preds), t_perplexity, v_perplexity, end_ts - start_ts))
    
    del X
    del y
    torch.cuda.empty_cache()
    if print_results:
        print('Finished Training')
    return model

In [23]:
model = train_model(model, train_X, train_y, epochs=200, learn_rate=0.001, minibatch_size=256, weight_decay=0.00001)

Epoch 0, Minibatch loss: 5.84, Subsample Accuracy: 0.19, Train Perplexity: 5476.68, Validation Perplexity: 5392.49
Epoch 1, Minibatch loss: 5.27, Subsample Accuracy: 0.19, Train Perplexity: 4591.04, Validation Perplexity: 4502.46
Epoch 2, Minibatch loss: 5.18, Subsample Accuracy: 0.18, Train Perplexity: 3781.74, Validation Perplexity: 3725.83
Epoch 3, Minibatch loss: 5.33, Subsample Accuracy: 0.18, Train Perplexity: 3172.56, Validation Perplexity: 3137.97
Epoch 4, Minibatch loss: 5.11, Subsample Accuracy: 0.18, Train Perplexity: 2724.83, Validation Perplexity: 2705.27
Epoch 5, Minibatch loss: 5.23, Subsample Accuracy: 0.19, Train Perplexity: 2389.16, Validation Perplexity: 2385.96
Epoch 6, Minibatch loss: 5.16, Subsample Accuracy: 0.18, Train Perplexity: 2150.90, Validation Perplexity: 2162.16
Epoch 7, Minibatch loss: 4.94, Subsample Accuracy: 0.19, Train Perplexity: 1951.61, Validation Perplexity: 1974.96
Epoch 8, Minibatch loss: 4.84, Subsample Accuracy: 0.19, Train Perplexity: 1790.

KeyboardInterrupt: 

In [18]:
torch.save(model.state_dict(), "LSTMBaseline.model") # save model

In [95]:
# load model, only run if the model hasn't already been trained
model.load_state_dict(torch.load("LSTMBaseline.model"))

<All keys matched successfully>

In [39]:
del model
torch.cuda.empty_cache()

### Hyper-Parameter tuning findings
- glove 300 dimension vectors are essential to not have a bias of 1000 perplexity on both train and validation
- 2 layers of LSTM also gives high bias, perhaps there is not enough train data
- Weight decay is essential in preventing Validation perplexity from skyrocketing
- Dropout of 0.1 combined with weight decay 0.00001 works (around 250 validation perplexity)
- Decreasing learning rate and increasing epochs has a minor benefit

### Examine Performance of the model
- Using both perplexity and qualitative evaluation

In [71]:
# view overall performance
model.eval()
data = numpy_to_tensor(train_X[:1000])
preds = model.predict(data)
#preds = torch.nn.Softmax(dim=-1)(preds).detach().cpu().numpy()

In [72]:
list(map(lambda x: index_vocab[x], np.argsort(preds[1, 11, :])[-10:]))

['probably',
 'help',
 'begin',
 'also',
 'take',
 'make',
 'continue',
 'the',
 'have',
 'be']

In [73]:
np.sort(preds[1][0])

array([5.1733707e-07, 5.4392586e-07, 5.5573935e-07, ..., 1.3860598e-02,
       3.9391726e-02, 7.3702824e-01], dtype=float32)

In [75]:
preds_int = np.argmax(preds, axis=-1)
for i in range(10):
    sentence = list(map(index_vocab.get, preds_int[i]))
    truth = list(map(lambda x: index_vocab[int(x)], train_y[i]))
    input_sentence = ['<PAD>'] + truth
    print("Predicted:", sentence)
    print("Input:    ",input_sentence)
    print()

Predicted: ['<PAD>', '<PAD>', 'securities', "'s", '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '.', '.', '<PAD>', '<PAD>', '<PAD>', 'the', '<PAD>', '<PAD>', '<PAD>', "'s", '<PAD>']
Input:     ['<PAD>', 'aer', 'banknote', 'berlitz', 'calloway', 'cluett', 'fromstein', 'gitano', 'guterman', 'ipo', 'kia', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'ssangyong', 'swapo', 'wachter']

Predicted: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'the', '.', '.', 'be', 'the', 'company', "'s", 'a', 'new', 'director', 'of', '.']
Input:     ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'pierre', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '.']

Predicted: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'said', 'a', 'of', 'the', 'inc.', 'company', "'s", 'group', '.']
Input:     ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 

In [76]:
train[:10]

[' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter \n',
 ' pierre <unk> N years old will join the board as a nonexecutive director nov. N \n',
 ' mr. <unk> is chairman of <unk> n.v. the dutch publishing group \n',
 ' rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate \n',
 ' a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported \n',
 ' the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said \n',
 ' <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in

In [77]:
pred1 = model.predict(numpy_to_tensor(valid_X[0:1]))
perplexity(pred1, valid_y[0])

80.610664

In [78]:
average_perplexity(model, train_X, train_y)

160.79262

In [79]:
average_perplexity(model, valid_X, valid_y)

245.91684

# GPT2 Baseline
Implement GPT2 as a language modelling baseline. GPT-3 is not publicly available and too large for practical purposes. BERT needs modification to work for language modelling, due to the fact that it is trained for bidirectional masked language modelling instead.

This section makes use of several tutorials for fine tuning, including:
- https://reyfarhan.com/posts/easy-gpt2-finetuning-huggingface/
- https://mccormickml.com/2019/07/22/BERT-fine-tuning/#4-train-our-classification-model
- https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel (the documentation)
- https://huggingface.co/transformers/custom_datasets.html

In [2]:
# load　GPT, BERT and support materials from huggingface
# requires pip install transformers
# if in jupyter notebook see here and you get an error mention ipython widgets see here: 
# https://stackoverflow.com/questions/53247985/tqdm-4-28-1-in-jupyter-notebook-intprogress-not-found-please-update-jupyter-an
from transformers import GPT2Tokenizer, GPT2LMHeadModel, top_k_top_p_filtering, Trainer, TrainingArguments

In [3]:
train, valid, test = torchtext.datasets.PennTreebank(split=('train', 'valid', 'test'))
train = list(train) # these are originally iterators, the data is so small we can just retrieve all of it at once
valid = list(valid)
test  = list(test)

In [4]:
# Download the models
# Documentation for GPT: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

### Experimenting

In [5]:
# GPT2 example generation
text = f"Hugging Face is based in DUMBO, New York City, and"
tokens_tensor = gpt_tokenizer.encode(text, return_tensors="pt")

# Put everything on cuda
gpt_model.eval()
tokens_tensor = tokens_tensor.to('cuda')
gpt_model.to('cuda')

# Predict all tokens
generated = tokens_tensor
for i in range(20):
    next_token_logits = gpt_model(generated).logits[:, -1, :]
    # filter
    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
    # sample
    probs = torch.nn.functional.softmax(filtered_next_token_logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    generated = torch.cat([generated, next_token], dim=-1)

resulting_string = gpt_tokenizer.decode(generated.tolist()[0])
resulting_string

'Hugging Face is based in DUMBO, New York City, and focuses on two characters who are either dead or are alive, the dead being an infant in its stomach'

In [6]:
tokens = gpt_tokenizer.encode(train[3])
print([gpt_tokenizer.decode([x]) for x in tokens])

[' rud', 'olph', ' <', 'unk', '>', ' N', ' years', ' old', ' and', ' former', ' chairman', ' of', ' consolidated', ' gold', ' fields', ' pl', 'c', ' was', ' named', ' a', ' nonex', 'ec', 'utive', ' director', ' of', ' this', ' b', 'rit', 'ish', ' industrial', ' conglomerate', ' ', '\n']


We can see that the gpt_tokenizer works differently to ours, splitting up names such as 'rudolph' into 'rud' and 'olph' and words such as nonexecutive and british. Hence our perplexity evaluation will have to be slightly different, using gpt_tokenizer to get the ground truth labels

### Data Processing

In [7]:
# Define a dataset class for fine-tuning, it's a generator so we don't have to store the entire dataset in memory
class GPT2Dataset(torch.utils.data.Dataset):
    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=40):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        # Encode all the text, padding and truncuating it along with adding attention masks to get the sequence length the same across all samples
        for txt in txt_list:
            encodings_dict = tokenizer.encode_plus('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # The tutorial use a dictionary format that also stores labels 
        return_dict = {"input_ids": torch.tensor(self.input_ids[idx]),
                       "attention_mask": torch.tensor(self.attn_masks[idx]), 
                       "labels": torch.tensor(self.input_ids[idx])} 
        return return_dict

In [8]:
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token # set the pad token
gpt_sequence_length = 40 # gpt splits up words into smaller tokens, so the sequence length should be longer
train_dataset = GPT2Dataset(train, gpt_tokenizer, max_length=gpt_sequence_length)
val_dataset = GPT2Dataset(valid, gpt_tokenizer, max_length=gpt_sequence_length)
test_dataset = GPT2Dataset(test, gpt_tokenizer, max_length=gpt_sequence_length)
train_dataset[1]

{'input_ids': tensor([   27,    91,  9688,  1659,  5239,    91,    29, 17748,   260,  1279,
          2954,    29,   399,   812,  1468,   481,  4654,   262,  3096,   355,
           257, 36196,   721,  8827,  3437,   645,    85,    13,   399,   220,
           198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([   27,    91,  9688,  1659,  5239,    91,    29, 17748,   260,  1279,
          2954,    29,   399,   812,  1468,   481,  4654,   262,  3096,   355,
           257, 36196,   721,  8827,  3437,   645,    85,    13,   399,   220,
           198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])}

In [9]:
# get output by passing the ids and the attention mask
gpt_model(input_ids=train_dataset[1]['input_ids'].to(device), attention_mask=train_dataset[1]['attention_mask'].to(device), use_cache=False)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[-31.7761, -30.6999, -32.1561,  ..., -39.5131, -39.7474, -31.6402],
        [-63.3376, -60.9081, -61.4334,  ..., -72.3227, -72.0701, -62.4869],
        [-53.8055, -53.4089, -53.4207,  ..., -63.6040, -62.0605, -54.8142],
        ...,
        [-82.7951, -76.3731, -78.5615,  ..., -95.3905, -96.0068, -84.1115],
        [-82.7967, -76.3781, -78.5659,  ..., -95.3878, -96.0029, -84.1119],
        [-82.8332, -76.4163, -78.6067,  ..., -95.4225, -96.0345, -84.1454]],
       device='cuda:0', grad_fn=<MmBackward>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [10]:
gpt_tokenizer.decode(train_dataset[5]['input_ids'])

'<|startoftext|> the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said'

### Fine Tuning

Do fine tuning of the gpt_model using the hugging face out of the box trainer https://huggingface.co/transformers/custom_datasets.html#fine-tuning-with-trainerfrom 

In [11]:
training_args = TrainingArguments(
    output_dir='gpt_finetuning',     # output directory
    num_train_epochs=1,              # total number of training epochs (1 is enough to get very low perplexity and perplexity increases at 2)
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,               # strength of weight decay
    logging_dir='gpt_finetuning_logs',            # directory for storing logs
    logging_steps=100,
)

trainer = Trainer(
    model=gpt_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Step,Training Loss
100,5.0793
200,2.6275
300,2.4378
400,2.402
500,2.3207
600,2.2569
700,2.2086
800,2.2025
900,2.2035
1000,2.1592


TrainOutput(global_step=2630, training_loss=2.287749967103675, metrics={'train_runtime': 254.6267, 'train_samples_per_second': 10.329, 'total_flos': 61761965506560.0, 'epoch': 1.0})

### Model Definition
Define a wrapper model that can use GPT2 both for standard next word prediction and language generation

In [12]:
# Build a model wrapper for gpt2 that uses the "past" variable and for language modelling
# TODO: add the options for beam search
class GPTModel(torch.nn.Module):
    def __init__(self, model=None, sequence_length=20):
        super().__init__()
        self.gpt = model.to(device)
        self.tokenizer = gpt_tokenizer
        self.sequence_length = sequence_length
        self.vocab_size = self.tokenizer.vocab_size
    
    # output the logits for the most likely next word at each position in the sentence and optionally the hidden states (used for the Neural ODE) 
    # note input_dataset must be an element taken from a GPT2Dataset class (e.g. train_dataset[0])
    def forward(self, input_dataset, output_hidden_states=False):
        output = self.gpt.forward(input_ids = input_dataset['input_ids'].to(device), 
                                  attention_mask=input_dataset['attention_mask'].to(device),
                                  use_cache=False,
                                  output_hidden_states = output_hidden_states)
        if output_hidden_states:
            return output["hidden_states"]
        return output["logits"]
    
    # take in a sentence and output the predictions as in forward, but as the most likely sentence not logits
    def forward_sentence(self, input_dataset):
        preds = self.forward(input_dataset)
        tokens = torch.argmax(preds, dim=-1)
        return self.tokenizer.decode(tokens)
    
    # generate a sentence by sampling the next word from the probability distribution
    # set limit to an integer to generate `limit` number of words instead of ending at a full stop
    def random_gen(self, x, limit=None):
        # initialize variables
        generated = self.tokenizer.encode_plus(x, return_tensors="pt")['input_ids'].to('cuda')
        next_token = [generated[0][-1]]
        past = None
        raw_output= None
        
        # generate until a "." is generated
        while (limit is None and self.tokenizer.decode(next_token[0]) not in [".", "?", "!"]) or (limit is not None and len(generated[0]) < limit):
            # get output of model, using past if available
            if past is None:
                raw_output = self.gpt(generated, past_key_values=past)
            else:
                raw_output = self.gpt(next_token, past_key_values=past)
            output, past = raw_output['logits'], raw_output['past_key_values']
            next_token_logits = output[:, -1, :]
            
            # sample a token from the top 50 most likely words
            filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) # filter to the top 50 tokens
            probs = torch.nn.functional.softmax(filtered_next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=-1)
            
        return self.tokenizer.decode(generated[0])
    
    # do beam_search to find the most likely sentence
    def beam_search(self, x, beam=5): 
        pass

In [13]:
modelgpt = GPTModel(model=gpt_model, sequence_length=gpt_sequence_length)
modelgpt.eval()
modelgpt.forward(train_dataset[0])

tensor([[-1.6970e+02, -1.6412e+02, -1.6493e+02,  ..., -2.0270e+02,
         -2.0474e+02, -1.6856e+02],
        [-2.6901e+00, -1.6066e+00,  4.1629e-01,  ..., -9.9957e+00,
         -1.0583e+01,  7.4749e-02],
        [-2.3482e+01, -2.3870e+01, -2.1071e+01,  ..., -4.0187e+01,
         -3.3782e+01, -2.0225e+01],
        ...,
        [-7.0412e+01, -7.0517e+01, -7.2410e+01,  ..., -7.6573e+01,
         -7.6298e+01, -6.6031e+01],
        [-6.7842e+01, -6.6881e+01, -6.5982e+01,  ..., -7.9229e+01,
         -7.9465e+01, -6.2473e+01],
        [-6.4527e+01, -6.3917e+01, -6.5069e+01,  ..., -7.2385e+01,
         -7.2153e+01, -6.0531e+01]], device='cuda:0', grad_fn=<MmBackward>)

View the next word output for a single example

### Evaluation

In [25]:
i = 18
input_t = [gpt_tokenizer.decode(t) for t in train_dataset[i]['input_ids']]
preds = modelgpt.forward(train_dataset[i])
tokens = torch.argmax(preds, dim=-1)
output_t = [gpt_tokenizer.decode(t) for t in tokens]
print("The format is:")
print("(Ground Truth word, Predicted next word),")
list(zip(input_t, output_t))

The format is:
(Ground Truth word, Predicted next word),


[('<', '\n'),
 ('|', ' |'),
 ('start', '|'),
 ('of', '|'),
 ('text', '|'),
 ('|', '>'),
 ('>', '\n'),
 (' the', ' first'),
 (' total', ' number'),
 (' of', ' the'),
 (' N', '.'),
 (' deaths', ' in'),
 (' from', ' all'),
 (' mal', 'ignant'),
 ('ignant', ' cancer'),
 (' <', '|'),
 ('unk', '>'),
 ('>', ' cancer'),
 (' lung', ' cancer'),
 (' cancer', '</'),
 (' and', ' <'),
 (' <', 'unk'),
 ('unk', '>'),
 ('>', ' lung'),
 (' was', ' <'),
 (' far', ' higher'),
 (' higher', ' than'),
 (' than', ' the'),
 (' expected', '.'),
 (' the', ' previous'),
 (' researchers', ' said'),
 (' said', '.'),
 (' ', '\xa0'),
 ('\n', '\n'),
 ('<|endoftext|>', 'The'),
 ('<|endoftext|>', 'The'),
 ('<|endoftext|>', 'The'),
 ('<|endoftext|>', 'The'),
 ('<|endoftext|>', 'The'),
 ('<|endoftext|>', 'The')]

View the ability to generate without teacher forcing using the random_gen() function

In [26]:
with torch.no_grad():
    result = modelgpt.random_gen("A new study says", limit=40)
result

'A new study says the number of people dying because there is little or no way for people to escape poverty has increased by nearly 500 percent since 2009, the period ending with 2014. And that is according'

In [27]:
# The formula for calculating perplexity in language models can be found here: https://web.stanford.edu/~jurafsky/slp3/3.pdf (page 8)
# An interesting detail is that the geometric mean of perplexity from each word is used
# if the mask is 0 at index i don't use the value at index i to calculate perplexity
def perplexity_gpt(preds, ground_truth, mask, epsilon=1e-30):
    probs = []
    for i in range(preds.shape[0]):
        if mask[i] != 0:
            probs.append(preds[i, int(ground_truth[i])])
    probs = np.array(probs)
    probs = np.power(1/(probs+epsilon), 1/probs.shape[0]) # normalise before taking the product, to prevent underflowing to 0
    return np.prod(probs).detach().cpu().numpy()

# Can optionally define n_samples=int to limit the number of samples used for perplexity evaluation
def average_perplexity_gpt(model, train, n_samples=None, print_results=False):
    perplexities = []
    n_samples = len(train) if n_samples is None else n_samples
    with torch.no_grad():
        for i in range(n_samples):
            # Compute perplexity for a single sample
            labels = train[i]['input_ids'][1:]
            mask = train[i]['attention_mask'][:-1]
            preds = model.forward(train[i])[:-1] # remove the last prediction as there is no ground truth 
            preds = torch.nn.functional.softmax(preds, dim=-1)
            perplexities.append(perplexity_gpt(preds[6:], labels[6:], mask[6:])) # remove the first 7 tokens that represent "<|startoftext|>"

            if i % 100 == 0 and print_results:
                print("Sentences analysed: {} Average perplexity: {}".format(i, np.mean(perplexities)))
    return np.mean(perplexities)

In [28]:
average_perplexity_gpt(modelgpt, train_dataset, n_samples=None, print_results=False)

KeyboardInterrupt: 

In [93]:
average_perplexity_gpt(modelgpt, val_dataset, n_samples=None, print_results=False)

26.49069

In [94]:
average_perplexity_gpt(modelgpt, test_dataset, n_samples=None, print_results=False)

23.95521

GPT2 Results
- No fine tuning: 706.16187 perplexity on the training set
- Fine tuning: 21.57 train, 26.49 validation, 23.95 testing    
    - The near SOTA perplexity (higher than the current best for GPT2 https://paperswithcode.com/sota/language-modelling-on-penn-treebank-word ) is due to the fact that the GPT tokenizer splits up the some words into smaller pieces e.g. "british" becomes: \[' b', 'rit', 'ish'\] and it's easier to predict those smaller pieces

# Neural ODE Model
Results
- LSTM as a hidden state: perplexity 651.04. It wasn't fine tuned but it's clear we need better

In [11]:
from torchdyn.models import *
from torchdyn import *
import pytorch_lightning as pl

In [12]:
# Build a wrapper for gpt that takes a torch.util.data.TensorDataset as input, needed for pytorch lightning
class GPTModelWrapper(torch.nn.Module):
    def __init__(self, model=None, sequence_length=40):
        super().__init__()
        self.gpt = model.to(device)
        self.tokenizer = gpt_tokenizer
        self.sequence_length = sequence_length
        self.vocab_size = self.tokenizer.vocab_size
    
    # output the hidden states for the entire sequence used for the Neural ODE
    def forward(self, input_dataset):
        output = self.gpt.forward(input_ids = input_dataset[0].to(device), 
                                  attention_mask=input_dataset[1].to(device),
                                  use_cache=False,
                                  output_hidden_states=True)
        return output["hidden_states"]

In [13]:
# Defines an ODE that uses a GPT to get a representation for the sentence
class ODEGPT(pl.LightningModule):
    def __init__(self, modelgpt, sequence_length=40):
        super().__init__()
        layer_size = 768 # the size of gpt's hidden state
        self.loss = torch.nn.CrossEntropyLoss()
        
        # Freeze the GPT model's parameters to save training time
        self.modelgpt = modelgpt
        for param in self.modelgpt.parameters():
            param.requires_grad = False
        
        # Define the derivative function
        self.f = torch.nn.Sequential(
            torch.nn.Linear(layer_size, layer_size),
            torch.nn.ReLU(),
            torch.nn.Linear(layer_size, layer_size),
        )
        
        # Define the model itself
        self.node = NeuralDE(self.f, sensitivity='adjoint', solver='dopri5').to(device)
        self.linear = torch.nn.Linear(layer_size, self.modelgpt.vocab_size).to(device)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.sequence_length = sequence_length
        self.timesteps = torch.arange(0, 40, 1, device=device).float() # define the number of output items of the Neural ODE
    
    # take in a single sample and feed forward, giving the logits as output
    # note x must be an element of a GPT2Dataset class so that it can be fed to the GPT model
    def forward(self, x):
        # at the moment this feeds the entire sequence to LSTM and asks Neural ODE to reproduce it
        # TODO: switch to feeding half the sequence and asking NeuralODE to extrapolate
        hidden_states = self.modelgpt(x) 
        attention_mask = x[1].to(device)[0, :] # batching makes x[1] have a shape of (batch_size, features), we use batches of 1 so take the first
        
        # use the output of GPT2's 12th decoder, "BERT Rediscovers the Classical NLP Pipeline" has shown transformers' later layers represent high level meaning, which is 
        # what we want to input to the Neural ODE
        # TODO: Perhaps consider the above paper's method of having a weighted sum of layers representations, with trainable weights
        final_hidden = hidden_states[12] 
        final_hidden = final_hidden[0, attention_mask, :][-1, :] # Take the output of the last sequence item that isn't a pad token

        # feed to neural ode
        sequence_outputs = self.node.trajectory(final_hidden, self.timesteps) # output is of shape (sequence_length, gpt_hidden_layer_size)
        
        # Get final output
        pred = self.linear(sequence_outputs)
        return pred
    
    # compute the loss on a batch, required by pytorch lightning
    # note the batch must be an element of a tf.utils.data.TensorDataset, this function is only meant to be used with pytorch_lightning's training loop
    def training_step(self, batch, batch_idx):
        labels = batch[2][0, 1:].to(device) # shift the input 1 step ahead to get the next word labels
        preds = self.forward(batch)[:-1, :] # remove the prediction for the last token as there is no label
        loss = self.loss(preds, labels) # crossentropy loss expects preds to be of size (batch, n_classes) so it handles our sequence model use case
        return loss
    
    # configure the optimizer for pytorch lightning
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.00001, betas=(0.95, 0.999)) # low learning rate and momentum since this is stochastic optimisation
    
    # wrapper function that forward propagates, applies softmax and converts to numpy 
    def predict(self, x):
        preds = self.forward(x)
        preds = self.softmax(preds).detach().cpu().numpy()
        return preds

In [14]:
gptmodel_wrapper = GPTModelWrapper(gpt_model)
odemodel = ODEGPT(gptmodel_wrapper)

Errors: setting num_workers = 1 causes DataLoader to hang. Setting num_workers = 0 causes a random CUDA error that doesn't happen in the above functions

### Training
Use the pytorch lightning's training loop to speed up training. 

Important documentation
- https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#training
- https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api

In [15]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [16]:
# Load the data into a new dataset, pytorch_lightning doesn't like our custom dataset
full_dataset = train_dataset[:]
train_tensor_dataset = torch.utils.data.TensorDataset(full_dataset['input_ids'], full_dataset['attention_mask'], full_dataset['labels'])
train_dataloader = torch.utils.data.DataLoader(train_tensor_dataset, batch_size=1, shuffle=True,
                             num_workers=16, pin_memory=True)

# Test run to check for errors
for batch in train_dataloader:
    print(odemodel.forward(batch).shape)
    print(odemodel.training_step(batch, 0))
    break

torch.Size([40, 50257])
tensor(3239.3477, device='cuda:0', grad_fn=<NllLossBackward>)


In [17]:
# Train the model
trainer = pl.Trainer(max_epochs=1, gpus=1, progress_bar_refresh_rate=10)
trainer.fit(odemodel, train_dataloader)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.core.lightning:
  | Name     | Type             | Params
----------------------------------------------
0 | loss     | CrossEntropyLoss | 0     
1 | modelgpt | GPTModelWrapper  | 124 M 
2 | f        | Sequential       | 1.2 M 
3 | node     | NeuralDE         | 1.2 M 
4 | linear   | Linear           | 38.6 M
5 | softmax  | Softmax          | 0     
----------------------------------------------
39.8 M    Trainable params
124 M     Non-trainable params
164 M     Total params
657.074   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]



Maybe turn off the dataloader parameters ?

In [None]:
help(train_dataloader)