In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import tqdm
import time

from transformers import BertTokenizer, TFBertModel, TFGPT2Model, GPT2Tokenizer, TFGPT2LMHeadModel
from Attention import AttentionUtils

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '<PAD>'})
vocab_size = tokenizer.vocab_size

gpt2 = TFGPT2LMHeadModel.from_pretrained('gpt2')
# gpt2lm = TFGPT2LMHeadModel.from_pretrained('gpt2')
# gpt2lm.summary()
gpt2.summary()


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgp_t2lm_head_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFGPT2MainLayer multiple                  124439808 
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


In [3]:
def tokenize(s, max_len=128):
    tok = tokenizer.encode(bytes.decode(s.numpy()), max_length=max_len, padding='max_length')
    return tf.constant(tok, dtype=tf.int32)


def shift(x):
    return x[:, :-1], x[:, 1:]

In [4]:
sentences = tf.data.experimental.load(
    'sentences_raw_gpttokens.tfrecord', compression='GZIP')
print(len(sentences))
for s in sentences.batch(2).take(1):
    print(tokenizer.decode(s[0], skip_special_tokens=True))
    print(tokenizer.decode(s[1], skip_special_tokens=True))

492686
the deep space nine transcripts - emissary emissary stardate: 46379.1 original airdate: 3 jan, 1993 on stardate 43997, captain jean-luc picard of the federation starship enterprise was kidnapped for six days by an invading force known as the borg.
 surgically altered, he was forced to lead an assault on starfleet at wolf 359.


In [5]:
ratios = (0.8, 0.1, 0.1)
assert sum(ratios) == 1

BUFFER_SIZE = 10000
BATCH_SIZE = 32

sentences = tf.data.experimental.load(
    'sentences_raw_gpttokens.tfrecord', compression='GZIP')
sentences = sentences.take(100*BATCH_SIZE).batch(2).shuffle(BUFFER_SIZE)
cardinality = len(sentences)
train_dataset = sentences.take(int(ratios[0] * cardinality))
valid_dataset = sentences.skip(int(ratios[0] * cardinality)).take(int(ratios[1] * cardinality))
test_dataset = sentences.skip(int(ratios[0] * cardinality) + int(ratios[1] * cardinality))

# shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)?

train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
valid_dataset = valid_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)


In [6]:
print(len(sentences), len(train_dataset), len(valid_dataset), len(test_dataset))

1600 40 5 5


In [41]:
def reorder(s):
    return s[:,0,:], s[:,1,:]

In [42]:
for s in train_dataset.map(reorder).take(2):
    print(s)
    # ss = [t for t in s[0].numpy() if t != tokenizer.pad_token_id]
    # ssp = [t for t in s[1].numpy() if t != tokenizer.pad_token_id]
#     print(tokenizer.decode(s[0].numpy(), skip_special_tokens=True))
#     print(tokenizer.decode(s[1].numpy(), skip_special_tokens=True))
#     # greedy_out = gpt2lm.generate(tf.constant(ss)[tf.newaxis, :], max_length=len(ss)*2)
#     greedy_out = gpt2lm(s[0])['logits']
#     greedy_out = tf.argmax(greedy_out, axis=-1)
#     # print(greedy_out)
#     print(tokenizer.decode(greedy_out, skip_special_tokens=True))
# #     break


(<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[16106,    25,  1277, ..., 50257, 50257, 50257],
       [  264,  1984,    78, ..., 50257, 50257, 50257],
       [  288,   897,    11, ..., 50257, 50257, 50257],
       ...,
       [  264,  1984,    78, ..., 50257, 50257, 50257],
       [ 3644,    25,  4686, ..., 50257, 50257, 50257],
       [ 1976,   323,   430, ..., 50257, 50257, 50257]])>, <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[  264,  1984,    78, ..., 50257, 50257, 50257],
       [27334,   343,    25, ..., 50257, 50257, 50257],
       [  288,   897,    25, ..., 50257, 50257, 50257],
       ...,
       [  267,  4598,    25, ..., 50257, 50257, 50257],
       [  264,  1984,    78, ..., 50257, 50257, 50257],
       [33217,  7143,   353, ..., 50257, 50257, 50257]])>)
(<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[  264,  1984,    78, ..., 50257, 50257, 50257],
       [  969,    76,   461, ..., 50257, 50257, 50257],
       [  783,    11,   922, ..., 502

In [37]:
class HFModel(tf.keras.Model):
    def __init__(self, model, vocab_size, dense, output_dense=True, make_base_trainable=False):
        super(HFModel, self).__init__()
        self.output_dense = output_dense
        self.model = model
        self.model.trainable = make_base_trainable
        self.dense = tf.keras.layers.Dense(dense, activation='relu')
        self.output_dense = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        model_outs = self.model(inputs).last_hidden_state
        # hidden_dense = self.dense(model_outs)        
        outputs = self.output_dense(model_outs)
        return outputs
        # return tf.argmax(outputs, axis=-1)


In [38]:
# model = HFModel(gpt2, vocab_size, 256, output_dense=False)
model = gpt2
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True), metrics=[tf.metrics.SparseCategoricalAccuracy()])


TypeError: 'NoneType' object is not subscriptable

In [49]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [50]:
for s in train_dataset.map(reorder).take(2):
    print(s[0])
    # print(model(s[0])['logits'].shape)

tf.Tensor(
[[  618 47161 46078 ... 50257 50257 50257]
 [  477   584  2243 ... 50257 50257 50257]
 [ 4999    13 50257 ... 50257 50257 50257]
 ...
 [   58  2840    60 ... 50257 50257 50257]
 [  275  1518   272 ... 50257 50257 50257]
 [    8   264  1984 ... 50257 50257 50257]], shape=(32, 128), dtype=int32)
tf.Tensor(
[[  264  1984    78 ... 50257 50257 50257]
 [ 3644    25  6509 ... 50257 50257 50257]
 [  264  1984    78 ... 50257 50257 50257]
 ...
 [  267  4598    25 ... 50257 50257 50257]
 [  479  8704    25 ... 50257 50257 50257]
 [  262  3173   389 ... 50257 50257 50257]], shape=(32, 128), dtype=int32)


In [43]:
model.fit(train_dataset.map(reorder), epochs=1, steps_per_epoch=100)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


InvalidArgumentError:  required broadcastable shapes at loc(unknown)
	 [[node Equal_1 (defined at \.conda\envs\tf-gpu\lib\site-packages\transformers\modeling_tf_utils.py:884) ]] [Op:__inference_train_function_16818]

Function call stack:
train_function


In [None]:
# model.fit(train_dataset.batch(BATCH_SIZE).map(mask_last), epochs=3,
#           validation_data=valid_dataset.batch(BATCH_SIZE).map(mask_last))

for s in sentences.batch(3).take(1):
    print(s)
    # print("Output shape", model(s[:,0,:])['logits'].shape)
    # print("Label shape", s[:,1,:].shape)


tf.Tensor(
[[[ 5170   257  2657 46091  2125   470  3177   881   286   257  4065
   26760    13 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257]
  [  264  1984    78    25  1595   470  2128   588   612   338   881
     356   460   466   546   340    13 50257 50257 50257 50257 50257
   50257 50257 50257 50257 50257 50257 50257 5

In [None]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = AttentionUtils.mask_loss(tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none'), tokenizer.pad_token_id)

acc_function = AttentionUtils.get_masked_acc(tokenizer.pad_token_id)

checkpoint_path = "./checkpoints/GPT2LM/train"

ckpt = tf.train.Checkpoint(transformer=model,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)

# if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print('Latest checkpoint restored!!')


In [None]:
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function()
def train_step(inp, tar):
    with tf.GradientTape() as tape:

        # print("Input shape", inp.shape, tar.shape)

        predictions = model(inp, training=True)['logits']
        # predictions = tf.cast(predictions, tf.float32)
        # print("Predictions type", predictions.dtype, tar.dtype)
        # print("Predictions shape", predictions.shape, tar.shape)
        loss = loss_function(tar, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)
        train_accuracy(acc_function(tar, predictions))


In [None]:
EPOCHS = 1
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    print(f"\nEPOCH {epoch+1}/{EPOCHS}:")
    pbar = tqdm.tqdm(train_dataset)
    for data in pbar:
        inp, tar = data[:,0,:], data[:,1,:]

        train_step(inp, tar)
        pbar.set_description(
            f"loss: {train_loss.result():.5f}, accuracy: {train_accuracy.result():.5f}")
        ckpt_save_path = ckpt_manager.save()

    print(f"loss {train_loss.result()}\t accuracy {train_accuracy.result()} in {np.round(time.time()-start, 2)} seconds")



EPOCH 1/1:


  0%|          | 0/40 [00:00<?, ?it/s]

Input shape (32, 128) (32, 128)
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Input shape (32, 128) (32, 128)


  0%|          | 0/40 [00:20<?, ?it/s]


InternalError:  'cuModuleGetFunction(&function, module, kernel_name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
	 [[node sparse_categorical_crossentropy/clip_by_value (defined at c:\Other Projects\NeuralNetworks\Project 3\Attention.py:187) ]] [Op:__inference_train_step_20533]

Function call stack:
train_step
