# Requirements
Requires an Nvidia GPU to run

Create a new anaconda environment and run the following commands to install the required libraries 
```
conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c conda-forge
conda install gensim
pip install torchdyn
pip install git+https://github.com/google-research/torchsde.git
```

# Citations
- Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993). Building a Large Annotated Corpus of English: The Penn Treebank

```
@article{poli2020torchdyn,
  title={TorchDyn: A Neural Differential Equations Library},
  author={Poli, Michael and Massaroli, Stefano and Yamashita, Atsushi and Asama, Hajime and Park, Jinkyoo},
  journal={arXiv preprint arXiv:2009.09346},
  year={2020}
}
```

- GloVe

- GPT2 paper

- Huggingface for their implementation of transformers? Not sure if this has a paper


# To do
- Use BERT as a baseline and possibly an encoder
- Examine mini batch calculation
- Use LSTM function 
- Consider other variants of Neural ODE
- Implement and see results from my continuous language modelling idea

In [1]:
import torch
import torch.utils.data
import torchtext
import numpy as np
import gensim.downloader as api
from functools import reduce
from sklearn.metrics import *
from time import time

# Pre Processing
- Build the vocab
- Convert text corpus into padded word vector sequences

To do
- Use LSTM as baseline
    - Examine perplexity of model on validation set
- Implement Neural ODE

In [2]:
# load word embeddings
glove = api.load("glove-wiki-gigaword-300")

In [2]:
train, valid, test = torchtext.datasets.PennTreebank(split=('train', 'valid', 'test'))
train = list(train) # these are originally iterators, the data is so small we can just retrieve all of it at once
valid = list(valid)
test  = list(test)

In [3]:
# build the vocab
corpus = train + valid
vocab = {"<PAD>": 0}
index_vocab = {0 : "<PAD>"}
for sentence in corpus:
    for token in sentence.split(" ")[1:]:
        if token not in vocab:
            index = len(vocab)
            vocab[token] = index
            index_vocab[index] = token

# replace penn treebank end sentence token "\n" with glove's end sentence token "."
index = vocab["\n"]
vocab.pop("\n")         
vocab["."] = index
index_vocab[index] = "."

# view size
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)

Vocab size:  10001


In [4]:
# pad sentences and convert words to their glove vector to get input features
# convert to 1 hot vocab and shift 1 to the left to get output labels (converting to 1 hot takes too much memory, so just store indices and convert later)
# use left padding, as we want the hidden state at the end (right) to ignore the padding
# returns word_vector_dataset, labels
def preprocess(dataset, sequence_length, wv):
    embedding_size = wv["hello"].shape[0]
    processed = np.zeros((len(dataset), sequence_length, embedding_size))
    labels = np.zeros((len(dataset), sequence_length, 1))
    
    for i in range(len(dataset)):
        tokens = dataset[i].split(" ")[1:]
        
        # get the word vectors for all of the tokens, removing out of vocabulary (OOV) tokens
        tokens_np = np.zeros((len(tokens), embedding_size))
        labels_np = np.zeros((len(tokens), 1))
        j = 0
        for word in tokens:
            if word == "\n": word = "." # replace PennTreebank end sentence token '\n' with glove end sentence token "."
            if word not in wv: continue # ignore OOV tokens
            if j < sequence_length - 1: # only add sequence_length - 1 tokens at max
                # so that there is always a 0 vector at the start so the model learns most common starting words
                tokens_np[j, :] = wv[word]
            # we can look ahead to find the next word to set as the label for the last word
            if j < sequence_length:
                labels_np[j, :] = vocab[word]
            else: break
            j += 1
            
        tokens_np = tokens_np[:j-1, :]
        labels_np = labels_np[:j, :]
        
        # add this sentence to the overall dataset, with left padding of 0 vectors
        processed[i, sequence_length - tokens_np.shape[0]:, :] = tokens_np
        labels[i, sequence_length - labels_np.shape[0]:, :] = labels_np
    return processed, labels

In [6]:
sequence_length = 20
train_X, train_y = preprocess(train, sequence_length, glove)
valid_X, valid_y = preprocess(valid, sequence_length, glove)
test_X , test_y  = preprocess(test,  sequence_length, glove)

In [7]:
# unit test to check the labelling works
assert preprocess(["hello there how are you doing \n"], 20, glove)[1][0][-1] == 25, "Output: {}".format(preprocess(["hello there how are you doing \n"], 20, glove)[1][0])

# LSTM Baseline
Create a baseline RNN and evaluate it's perplexity

In [3]:
class LSTMModel(torch.nn.Module):
    def __init__(self, vocab_size, input_size=100, layer_size=100, dropout=0):
        super().__init__()
        self.LSTM = torch.nn.LSTM(input_size, layer_size, 1, bidirectional=False)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear = torch.nn.Linear(layer_size, vocab_size)
        self.softmax = torch.nn.Softmax(dim=-1)
    
    def forward(self, x):
        # convert words to their vectors here
        sequence_outputs, hidden_state = self.LSTM(x)
        sequence_outputs = self.dropout(sequence_outputs)
        pred = self.linear(sequence_outputs)
        return pred
    
    # wrapper function that forward propagates, applies softmax and converts to numpy 
    def predict(self, x):
        preds = self.forward(x)
        preds = self.softmax(preds).detach().cpu().numpy()
        return preds

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def numpy_to_tensor(array):
    return torch.from_numpy(array).to(device).float()

In [5]:
model = LSTMModel(vocab_size, input_size=300, layer_size=300, dropout=0.1)
model.to(device)
model.eval()

NameError: name 'vocab_size' is not defined

In [12]:
%%time
# unit test to check that forward propagation works
data = numpy_to_tensor(train_X[:1000])
print(model.forward(data).shape)

torch.Size([1000, 20, 10001])
Wall time: 642 ms


In [13]:
# clear memory
del data
torch.cuda.empty_cache()

In [44]:
# Define functions to calculate perplexity for a single sentence: see the metric definition here https://web.stanford.edu/~jurafsky/slp3/3.pdf 
# We use teacher forcing (feeding the ground_truth label for sequence i to get pred for sequence i+1) to get the predictions
def perplexity(preds, ground_truth, epsilon=1e-30):
    probs = []
    for i in range(preds.shape[1]):
        probs.append(preds[0, i, int(ground_truth[i])])
    probs = np.array(probs)
    probs = np.power(1/(probs+epsilon), 1/probs.shape[0]) # normalise before taking the product, to prevent underflowing to 0
    return np.prod(probs)

# Calculate overall perplexity for a dataset
def average_perplexity(model, X, y):
    perplexities = [perplexity(model.predict(numpy_to_tensor(X[i:i+1])), y[i]) for i in range(X.shape[0])]
    return np.mean(perplexities)

In [15]:
# training the model
def train_model(model, train_X, train_y, epochs=10, learn_rate=0.01, weight_decay=0.001, minibatch_size=128, print_results=True):
    # Prepare data
    X = numpy_to_tensor(train_X)
    y = numpy_to_tensor(train_y).long()[:, :, 0]
    n_samples = X.shape[0]
    
    # Define loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate, weight_decay=weight_decay)

    # Ensure this runs on gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(epochs):      
        model.train() # set to train flag
        start_ts = time()
        
        # shuffle the data
        new_indices = torch.randperm(n_samples)
        X = X[new_indices, :, :] 
        y = y[new_indices, :]
        
        for batch_n in range(int(np.ceil(n_samples/minibatch_size))):
            # get the minibatch
            start_index = batch_n * minibatch_size
            end_index = min(start_index + minibatch_size, n_samples)
            batch_X = X[start_index: end_index, :, :]
            batch_y = y[start_index: end_index, :]
            
            # forward + backward + optimize
            optimizer.zero_grad()
            outputs = model(batch_X) 
            outputs = torch.swapaxes(outputs, 1, 2) # cross entropy expects a tensor of (n_samples, n_outputs, sequence_length)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
        # evaluate performance on part of the data (for memory reasons we take a subsample)
        if print_results:
            with torch.no_grad():
                model.eval()
                preds = np.argmax(X[:2000, :, :].detach().cpu().numpy(), axis=-1).flatten() # flatten the arrays so accuracy score works
                targets = y[:2000].detach().cpu().numpy().flatten()
                t_perplexity = average_perplexity(model, train_X[:2000], train_y[:2000])
                v_perplexity = average_perplexity(model, valid_X, valid_y)
                end_ts = time()
                print("Epoch {}, Minibatch loss: {:.2f}, Subsample Accuracy: {:.2f}, Train Perplexity: {:.2f}, Validation Perplexity: {:.2f}, Epoch Time: {:.2f} seconds".format(epoch, loss.item(),
                    accuracy_score(targets, preds), t_perplexity, v_perplexity, end_ts - start_ts))
    
    del X
    del y
    torch.cuda.empty_cache()
    if print_results:
        print('Finished Training')
    return model

In [23]:
model = train_model(model, train_X, train_y, epochs=200, learn_rate=0.001, minibatch_size=256, weight_decay=0.00001)

Epoch 0, Minibatch loss: 5.84, Subsample Accuracy: 0.19, Train Perplexity: 5476.68, Validation Perplexity: 5392.49
Epoch 1, Minibatch loss: 5.27, Subsample Accuracy: 0.19, Train Perplexity: 4591.04, Validation Perplexity: 4502.46
Epoch 2, Minibatch loss: 5.18, Subsample Accuracy: 0.18, Train Perplexity: 3781.74, Validation Perplexity: 3725.83
Epoch 3, Minibatch loss: 5.33, Subsample Accuracy: 0.18, Train Perplexity: 3172.56, Validation Perplexity: 3137.97
Epoch 4, Minibatch loss: 5.11, Subsample Accuracy: 0.18, Train Perplexity: 2724.83, Validation Perplexity: 2705.27
Epoch 5, Minibatch loss: 5.23, Subsample Accuracy: 0.19, Train Perplexity: 2389.16, Validation Perplexity: 2385.96
Epoch 6, Minibatch loss: 5.16, Subsample Accuracy: 0.18, Train Perplexity: 2150.90, Validation Perplexity: 2162.16
Epoch 7, Minibatch loss: 4.94, Subsample Accuracy: 0.19, Train Perplexity: 1951.61, Validation Perplexity: 1974.96
Epoch 8, Minibatch loss: 4.84, Subsample Accuracy: 0.19, Train Perplexity: 1790.

KeyboardInterrupt: 

In [18]:
torch.save(model.state_dict(), "LSTMBaseline.model") # save model

In [95]:
# load model, only run if the model hasn't already been trained
model.load_state_dict(torch.load("LSTMBaseline.model"))

<All keys matched successfully>

In [39]:
del model
torch.cuda.empty_cache()

### Hyper-Parameter tuning findings
- glove 300 dimension vectors are essential to not have a bias of 1000 perplexity on both train and validation
- 2 layers of LSTM also gives high bias, perhaps there is not enough train data
- Weight decay is essential in preventing Validation perplexity from skyrocketing
- Dropout of 0.1 combined with weight decay 0.00001 works (around 250 validation perplexity)
- Decreasing learning rate and increasing epochs has a minor benefit

### Examine Performance of the model
- Using both perplexity and qualitative evaluation

In [71]:
# view overall performance
model.eval()
data = numpy_to_tensor(train_X[:1000])
preds = model.predict(data)
#preds = torch.nn.Softmax(dim=-1)(preds).detach().cpu().numpy()

In [72]:
list(map(lambda x: index_vocab[x], np.argsort(preds[1, 11, :])[-10:]))

['probably',
 'help',
 'begin',
 'also',
 'take',
 'make',
 'continue',
 'the',
 'have',
 'be']

In [73]:
np.sort(preds[1][0])

array([5.1733707e-07, 5.4392586e-07, 5.5573935e-07, ..., 1.3860598e-02,
       3.9391726e-02, 7.3702824e-01], dtype=float32)

In [75]:
preds_int = np.argmax(preds, axis=-1)
for i in range(10):
    sentence = list(map(index_vocab.get, preds_int[i]))
    truth = list(map(lambda x: index_vocab[int(x)], train_y[i]))
    input_sentence = ['<PAD>'] + truth
    print("Predicted:", sentence)
    print("Input:    ",input_sentence)
    print()

Predicted: ['<PAD>', '<PAD>', 'securities', "'s", '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '.', '.', '<PAD>', '<PAD>', '<PAD>', 'the', '<PAD>', '<PAD>', '<PAD>', "'s", '<PAD>']
Input:     ['<PAD>', 'aer', 'banknote', 'berlitz', 'calloway', 'cluett', 'fromstein', 'gitano', 'guterman', 'ipo', 'kia', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'ssangyong', 'swapo', 'wachter']

Predicted: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'the', '.', '.', 'be', 'the', 'company', "'s", 'a', 'new', 'director', 'of', '.']
Input:     ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'pierre', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '.']

Predicted: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'said', 'a', 'of', 'the', 'inc.', 'company', "'s", 'group', '.']
Input:     ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 

In [76]:
train[:10]

[' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter \n',
 ' pierre <unk> N years old will join the board as a nonexecutive director nov. N \n',
 ' mr. <unk> is chairman of <unk> n.v. the dutch publishing group \n',
 ' rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate \n',
 ' a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported \n',
 ' the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said \n',
 ' <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in

In [77]:
pred1 = model.predict(numpy_to_tensor(valid_X[0:1]))
perplexity(pred1, valid_y[0])

80.610664

In [78]:
average_perplexity(model, train_X, train_y)

160.79262

In [79]:
average_perplexity(model, valid_X, valid_y)

245.91684

# GPT2 Baseline
Implement GPT2 as a language modelling baseline. GPT-3 is not publicly available and too large for practical purposes. BERT needs modification to work for language modelling, due to the fact that it is trained for bidirectional masked language modelling instead.

This section makes use of several tutorials for fine tuning, including:
- https://reyfarhan.com/posts/easy-gpt2-finetuning-huggingface/
- https://mccormickml.com/2019/07/22/BERT-fine-tuning/#4-train-our-classification-model
- https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel (the documentation)
- https://huggingface.co/transformers/custom_datasets.html

In [161]:
# load　GPT, BERT and support materials from huggingface
# requires pip install transformers
# if in jupyter notebook see here and you get an error mention ipython widgets see here: 
# https://stackoverflow.com/questions/53247985/tqdm-4-28-1-in-jupyter-notebook-intprogress-not-found-please-update-jupyter-an
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel, top_k_top_p_filtering
from transformers import Trainer, TrainingArguments

In [40]:
train, valid, test = torchtext.datasets.PennTreebank(split=('train', 'valid', 'test'))
train = list(train) # these are originally iterators, the data is so small we can just retrieve all of it at once
valid = list(valid)
test  = list(test)

In [41]:
# Download the models
# Documentation for GPT: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

### Experimenting

In [97]:
# GPT2 example generation
text = f"Hugging Face is based in DUMBO, New York City, and"
tokens_tensor = gpt_tokenizer.encode(text, return_tensors="pt")

# Put everything on cuda
gpt_model.eval()
tokens_tensor = tokens_tensor.to('cuda')
gpt_model.to('cuda')

# Predict all tokens
generated = tokens_tensor
for i in range(20):
    next_token_logits = gpt_model(generated).logits[:, -1, :]
    # filter
    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
    # sample
    probs = torch.nn.functional.softmax(filtered_next_token_logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    generated = torch.cat([generated, next_token], dim=-1)

resulting_string = gpt_tokenizer.decode(generated.tolist()[0])
resulting_string

"Hugging Face is based in DUMBO, New York City, and is an upcoming project by Columbia Pictures, the filmmakers behind the indie drama You're Missing Me. See"

In [8]:
tokens = gpt_tokenizer.encode(train[3])
print([gpt_tokenizer.decode([x]) for x in tokens])

[' rud', 'olph', ' <', 'unk', '>', ' N', ' years', ' old', ' and', ' former', ' chairman', ' of', ' consolidated', ' gold', ' fields', ' pl', 'c', ' was', ' named', ' a', ' nonex', 'ec', 'utive', ' director', ' of', ' this', ' b', 'rit', 'ish', ' industrial', ' conglomerate', ' ', '\n']


We can see that the gpt_tokenizer works differently to ours, splitting up names such as 'rudolph' into 'rud' and 'olph' and words such as nonexecutive and british. Hence our perplexity evaluation will have to be slightly different, using gpt_tokenizer to get the ground truth labels

### Data Processing

In [52]:
# Define a dataset class for fine-tuning, it's a generator so we don't have to store the entire dataset in memory
class GPT2Dataset(torch.utils.data.Dataset):
    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=40):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        # Encode all the text, padding and truncuating it along with adding attention masks to get the sequence length the same across all samples
        for txt in txt_list:
            encodings_dict = tokenizer.encode_plus('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']).to(device))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']).to(device))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [167]:
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token # set the pad token
gpt_sequence_length = 40 # gpt splits up words into smaller tokens, so the sequence length should be longer
train_dataset = GPT2Dataset(train, gpt_tokenizer, max_length=gpt_sequence_length)
val_dataset = GPT2Dataset(valid, gpt_tokenizer, max_length=gpt_sequence_length)
train_dataset[1]

(tensor([   27,    91,  9688,  1659,  5239,    91,    29, 17748,   260,  1279,
          2954,    29,   399,   812,  1468,   481,  4654,   262,  3096,   355,
           257, 36196,   721,  8827,  3437,   645,    85,    13,   399,   220,
           198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'))

In [168]:
# get output by passing the ids and the attention mask
gpt_model(input_ids=train_dataset[1][0], attention_mask=train_dataset[1][1], use_cache=False)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[-31.7761, -30.6999, -32.1561,  ..., -39.5131, -39.7474, -31.6402],
        [-63.3376, -60.9081, -61.4334,  ..., -72.3227, -72.0701, -62.4869],
        [-53.8055, -53.4089, -53.4207,  ..., -63.6040, -62.0605, -54.8142],
        ...,
        [-82.7951, -76.3731, -78.5615,  ..., -95.3905, -96.0068, -84.1115],
        [-82.7967, -76.3781, -78.5659,  ..., -95.3878, -96.0029, -84.1119],
        [-82.8332, -76.4163, -78.6067,  ..., -95.4225, -96.0345, -84.1454]],
       device='cuda:0', grad_fn=<MmBackward>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [169]:
gpt_tokenizer.decode(train_dataset[1][0])

'<|startoftext|> pierre <unk> N years old will join the board as a nonexecutive director nov. N \n<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

### Model Definition

In [162]:
# Build a model wrapper for gpt2 that uses the "past" variable and for language modelling
# TODO: Fine tuning - need to add padding for backprop in torch to work (just add '.' at the end for padding)
# TODO: add the options for beam search
class GPTModel(torch.nn.Module):
    def __init__(self, model=None, sequence_length=20):
        super().__init__()
        self.gpt = model.to(device)
        self.tokenizer = gpt_tokenizer
        self.sequence_length = sequence_length
        self.vocab_size = self.tokenizer.vocab_size
    
    # output the logits for the most likely next word at each position in the sentence and optionally the loss if loss=True
    # note input_dataset must be an element taken from a GPT2Dataset class (e.g. train_dataset[0])
    def forward(self, input_dataset, loss=False):
        label_ids = input_dataset[0] if loss else None
        output = self.gpt.forward(input_ids = input_dataset[0], attention_mask=input_dataset[1], labels=label_ids, use_cache=False)
        if loss:
            return output["logits"], output["loss"]
        return output["logits"]
    
    # take in a sentence and output the predictions as in forward, but as the most likely sentence not logits
    def forward_sentence(self, input_dataset):
        preds = self.forward(input_dataset)
        tokens = torch.argmax(preds, dim=-1)
        return self.tokenizer.decode(tokens)
    
    # generate a sentence by sampling the next word from the probability distribution
    # set limit to an integer to generate `limit` number of words instead of ending at a full stop
    def random_gen(self, x, limit=None):
        # initialize variables
        generated = self.tokenizer.encode_plus(x, return_tensors="pt")['input_ids'].to('cuda')
        next_token = [generated[0][-1]]
        past = None
        raw_output= None
        
        # generate until a "." is generated
        while (limit is None and self.tokenizer.decode(next_token[0]) not in [".", "?", "!"]) or (limit is not None and len(generated[0]) < limit):
            # get output of model, using past if available
            if past is None:
                raw_output = self.gpt(generated, past_key_values=past)
            else:
                raw_output = self.gpt(next_token, past_key_values=past)
            output, past = raw_output['logits'], raw_output['past_key_values']
            next_token_logits = output[:, -1, :]
            
            # sample a token from the top 50 most likely words
            filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) # filter to the top 50 tokens
            probs = torch.nn.functional.softmax(filtered_next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=-1)
            
        return self.tokenizer.decode(generated[0])
    
    # do beam_search to find the most likely sentence
    def beam_search(self, x, beam=5): 
        pass

In [163]:
modelgpt = GPTModel(model=gpt_model, sequence_length=gpt_sequence_length)
modelgpt.eval()
modelgpt.forward(train_dataset[0])

tensor([[-31.7761, -30.6999, -32.1561,  ..., -39.5131, -39.7474, -31.6402],
        [-63.3376, -60.9081, -61.4334,  ..., -72.3227, -72.0701, -62.4869],
        [-53.8055, -53.4089, -53.4207,  ..., -63.6040, -62.0605, -54.8142],
        ...,
        [-73.2748, -73.2788, -74.6521,  ..., -78.8758, -78.8732, -71.1970],
        [-85.5040, -84.4583, -85.6106,  ..., -92.3292, -91.5049, -83.1960],
        [-67.5806, -67.4142, -67.8963,  ..., -74.4724, -74.5060, -65.2784]],
       device='cuda:0', grad_fn=<MmBackward>)

In [100]:
modelgpt(train_dataset[0], loss=True)

(tensor([[-31.7761, -30.6999, -32.1561,  ..., -39.5131, -39.7474, -31.6402],
         [-63.3376, -60.9081, -61.4334,  ..., -72.3227, -72.0701, -62.4869],
         [-53.8055, -53.4089, -53.4207,  ..., -63.6040, -62.0605, -54.8142],
         ...,
         [-73.2748, -73.2788, -74.6521,  ..., -78.8758, -78.8732, -71.1970],
         [-85.5040, -84.4583, -85.6106,  ..., -92.3292, -91.5049, -83.1960],
         [-67.5806, -67.4142, -67.8963,  ..., -74.4724, -74.5060, -65.2784]],
        device='cuda:0', grad_fn=<MmBackward>),
 tensor(7.4033, device='cuda:0', grad_fn=<NllLossBackward>))

In [119]:
i = 17
modelgpt.forward_sentence(train_dataset[i])

'\n ||||end|- the following words members of been|now in.: of a acquiredun> diagnosed.------------------------\nTheTheTheTheTheTheTheTheTheTheTheThe'

In [114]:
with torch.no_grad():
    result = modelgpt.random_gen("Did you ever hear the tragedy of darth plaguies the wise? Darth plaguies was a", limit=200)
result

'Did you ever hear the tragedy of darth plaguies the wise? Darth plaguies was a villain who played an important role in the creation of the Sith. He was a very wise man, and many heroes died as a result of this, but when they became wise, he kept it safe in his home at the far end of his empire, as well as a nice little room to hide out.\n\nAnd what more could I ask for? I hope this post answers all of those questions.\n\nWhat is the Darth Plagui?\n\nThe Plagui, or "Gothic Plagui," is the dark side of Darth Vader\'s Force powers. In the original films, Plagui were part of the Sith, who eventually fell to the Dark Eldar. Plagui had the power of the dark side with an appearance that would have allowed them to use the power of Darth Vader to great effect if they truly wanted to. The story here'

### Evaluation

In [153]:
# if the mask is 0 at index i don't use the value at index i to calculate perplexity
def perplexity(preds, ground_truth, mask, epsilon=1e-30):
    probs = []
    for i in range(preds.shape[0]):
        if mask[i] != 0:
            probs.append(preds[i, int(ground_truth[i])])
    probs = np.array(probs)
    probs = np.power(1/(probs+epsilon), 1/probs.shape[0]) # normalise before taking the product, to prevent underflowing to 0
    return np.prod(probs).detach().cpu().numpy()

def average_perplexity_gpt(model, train, print_results=False):
    perplexities = []
    with torch.no_grad():
        for i in range(len(train)):
            # Get new ground truth labels using gpt_tokenizer
            labels = train[i][0][1:]
            mask = train[i][1][:-1]
            preds = model.forward(train[i])[:-1] # remove the last prediction as there is no ground truth 
            preds = torch.nn.functional.softmax(preds, dim=-1)
            perplexities.append(perplexity(preds, labels, mask))

            if i % 100 == 0 and print_results:
                print("Sentences analysed: {} Average perplexity: {}".format(i, np.mean(perplexities)))
    return np.mean(perplexities)

In [155]:
average_perplexity_gpt(modelgpt, list(zip(*train_dataset[:2000])), print_results=True)

Sentences analysed: 0 Average perplexity: 1641.3529052734375
Sentences analysed: 100 Average perplexity: 558.3963623046875
Sentences analysed: 200 Average perplexity: 617.5173950195312
Sentences analysed: 300 Average perplexity: 613.2991943359375
Sentences analysed: 400 Average perplexity: 605.2006225585938
Sentences analysed: 500 Average perplexity: 588.6340942382812
Sentences analysed: 600 Average perplexity: 581.43017578125
Sentences analysed: 700 Average perplexity: 579.1553955078125
Sentences analysed: 800 Average perplexity: 587.950439453125
Sentences analysed: 900 Average perplexity: 594.396728515625
Sentences analysed: 1000 Average perplexity: 592.6068725585938
Sentences analysed: 1100 Average perplexity: 598.1767578125
Sentences analysed: 1200 Average perplexity: 613.7470703125
Sentences analysed: 1300 Average perplexity: 604.9384155273438
Sentences analysed: 1400 Average perplexity: 614.0787963867188
Sentences analysed: 1500 Average perplexity: 605.5595703125
Sentences analys

601.60114

### Fine Tuning

Do fine tuning using the hugging face out of the box trainer https://huggingface.co/transformers/custom_datasets.html#fine-tuning-with-trainerfrom 

In [172]:
training_args = TrainingArguments(
    output_dir='gpt_finetuning',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,               # strength of weight decay
    logging_dir='gpt_finetuning_logs',            # directory for storing logs
    logging_steps=10,
)

model = gpt_model

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

TypeError: vars() argument must have __dict__ attribute

In [36]:
# perform fine tuning, using stochastic gradient descent to prevent OOM error
# to compensate for this we use a smaller learning rate and large beta parameter (for adam optimization) to counter that
def finetune_gpt(model, train, epochs=10, learn_rate=0.00001, betas=(0.99, 0.9995), weight_decay=0.000001, print_results=True):
    # Ensure this runs on gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Prepare data: get labels
    train = model.tokenizer.batch_encode_plus(train)['input_ids']
    labels = [s[1:] for s in train]
    train = [model.tokenizer.decode(train[i][:-1]) for i in range(len(train))] # the model expects raw string input so decode it again
    X = np.array(train)
    n_samples = X.shape[0]
    
    # Pad the labels (the output of model.forward is padded TODO: fix this). Then convert them to a tensor
    y = np.zeros((n_samples, gpt_sequence_length))
    for index in range(len(y)):
        y_i = labels[index][:gpt_sequence_length] + [0]*max(gpt_sequence_length - len(labels[index]), 0) 
        y_i = np.array(y_i, dtype=int).reshape(1, gpt_sequence_length)
        y[index, :] = y_i
    y = torch.from_numpy(y).to(device).long()
    
    
    # Define loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate, betas=betas, weight_decay=weight_decay)
    
    for epoch in range(epochs):      
        model.train() # set to train flag
        start_ts = time()
        
        # shuffle the data
        new_indices = torch.randperm(n_samples)
        X = X[new_indices]
        y = y[new_indices]
        
        # Stochastic gradient descent
        for index in range(n_samples):
            optimizer.zero_grad()
            outputs = model.forward(X[index]) 
            outputs = torch.swapaxes(outputs, 1, 2) # cross entropy expects a tensor of (n_samples, n_outputs, sequence_length)
            
            # Perform optimization step
            loss = criterion(outputs, y[index:index+1, :])
            loss.backward()
            optimizer.step()
            
            # show progress by printing a # every 5% of training data completed
            if print_results and index % int(n_samples/20) == 0 and index != 0:
                print('#', end='')
            
        # evaluate performance on part of the data (for memory reasons we take a subsample)
        if print_results:
            print()
            with torch.no_grad():
                model.eval()
                t_perplexity = average_perplexity_gpt(model, train[:100], print_results=False)
                v_perplexity = average_perplexity_gpt(model, valid[:100], print_results=False)
                end_ts = time()
                print("Epoch {}, Train Perplexity: {:.2f}, Validation Perplexity: {:.2f}, Epoch Time: {:.2f} seconds".format(
                    epoch, t_perplexity, v_perplexity, end_ts - start_ts))
    
    torch.cuda.empty_cache()
    if print_results:
        print('Finished Training')
    return model

In [37]:
modelgpt = finetune_gpt(modelgpt, train, epochs=1, learn_rate=0.00001,  betas=(0.99, 0.9995), weight_decay=0.000001, print_results=True)

torch.Size([42068, 40])
####################Epoch 0, Train Perplexity: 6004359.50, Validation Perplexity: 133683.17, Epoch Time: 23799.71 seconds
Finished Training


Something about this is adversarial because the train perplexity is much higher than the validation perplexity

In [38]:
modelgpt.gpt.save_pretrained('Models\') 

SyntaxError: EOL while scanning string literal (<ipython-input-38-e14df2f90c2b>, line 1)

Perhaps try this:
- This has a guide on their fine tuning API: https://huggingface.co/transformers/custom_datasets.html , it's likely to be more optimized than ours. The penntreebank may be available from them
- Manual script for fine tuning https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py

GPT2 Results
- No fine tuning: 706.16187 perplexity
- Fine tuning: 

# Neural ODE Model

In [15]:
import sys ; sys.path.append('../')
from torchdyn.models import *
from torchdyn.datasets import *
from torchdyn import *

In [16]:
# an Neural ODE that uses an LSTMCell as the derivative function
class ODELSTM(torch.nn.Module):
    def __init__(self, vocab_size, input_size=100, layer_size=100, dropout=0):
        super().__init__()
        self.LSTM = torch.nn.LSTM(input_size, layer_size, 1, bidirectional=False) # this encodes the sequence
        #self.fnode = torch.nn.LSTMCell(input_size, layer_size, 1, bidirectional=False)
        self.f = torch.nn.Sequential(
            torch.nn.Linear(layer_size, layer_size),
            torch.nn.ReLU(),
            torch.nn.Linear(layer_size, layer_size),
        )
        self.node = NeuralDE(self.f, sensitivity='adjoint', solver='dopri5').to(device)
        self.linear = torch.nn.Linear(layer_size, vocab_size)
        self.softmax = torch.nn.Softmax(dim=-1)
    
    def forward(self, x):
        # at the moment this feeds the entire sequence to LSTM and asks Neural ODE to reproduce it
        # TODO: switch to feeding half the sequence and asking NeuralODE to extrapolate
        sequence_outputs, hidden_state = self.LSTM(x)
        final_hidden = sequence_outputs[:, -1, :]

        # feed to neural ode
        timesteps = torch.linspace(0, sequence_length-1, sequence_length).to(device)
        sequence_outputs = self.node.trajectory(final_hidden, timesteps) # input is 128 final hidden states of dimension 300
        sequence_outputs = torch.swapaxes(sequence_outputs, 0, 1)        # output is output across 20 timesteps giving as (20, 128, 300) output, so swap the sequence and batch dimension
        
        # Get final output
        pred = self.linear(sequence_outputs)
        return pred
    
    # wrapper function that forward propagates, applies softmax and converts to numpy 
    def predict(self, x):
        preds = self.forward(x)
        preds = self.softmax(preds).detach().cpu().numpy()
        return preds

In [20]:
NODEmodel = ODELSTM(vocab_size, input_size=300, layer_size=300)

In [21]:
NODEmodel = train_model(NODEmodel, train_X, train_y, epochs=10, learn_rate=0.001, weight_decay=0)

Epoch 0, Minibatch loss: 5.62, Subsample Accuracy: 0.19, Train Perplexity: 992.81, Validation Perplexity: 949.61, Epoch Time: 334.68 seconds
Epoch 1, Minibatch loss: 5.38, Subsample Accuracy: 0.18, Train Perplexity: 825.13, Validation Perplexity: 815.57, Epoch Time: 438.83 seconds
Epoch 2, Minibatch loss: 5.46, Subsample Accuracy: 0.19, Train Perplexity: 718.59, Validation Perplexity: 734.18, Epoch Time: 517.79 seconds
Epoch 3, Minibatch loss: 5.50, Subsample Accuracy: 0.19, Train Perplexity: 699.71, Validation Perplexity: 762.84, Epoch Time: 598.56 seconds
Epoch 4, Minibatch loss: 5.32, Subsample Accuracy: 0.18, Train Perplexity: 602.00, Validation Perplexity: 666.84, Epoch Time: 600.94 seconds
Epoch 5, Minibatch loss: 5.17, Subsample Accuracy: 0.18, Train Perplexity: 601.95, Validation Perplexity: 663.96, Epoch Time: 599.20 seconds
Epoch 6, Minibatch loss: 5.11, Subsample Accuracy: 0.18, Train Perplexity: 1000.37, Validation Perplexity: 742.20, Epoch Time: 607.65 seconds
Epoch 7, Min

In [37]:
del NODEmodel
torch.cuda.empty_cache()