In [1]:
import tensorflow as tf
from keras import layers
import numpy as np
import pickle

In [2]:
def create_padding_mask(seq, n=4):
    mask = tf.cast(seq == 0, tf.float32)
    return tf.reshape(mask, (tf.shape(mask)[0], *(1,) * (n - 2), tf.shape(mask)[-1]))


def create_look_ahead_mask(seq_len):
    mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return tf.cast(mask, tf.float32)


def create_mask(inp, n=4):
    padding_mask = create_padding_mask(inp, n)
    seq_len = tf.shape(inp)[1]  # Get the sequence length dynamically
    look_ahead_mask = create_look_ahead_mask(seq_len)
    return tf.maximum(padding_mask, look_ahead_mask)

In [3]:
def positional_encoding(position, d_model):
    def get_angles(pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model,
    )

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    return tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)

In [4]:
def skew(tensor):

  paddings = [[0, 0] for _ in range(len(tensor.shape) - 1)]
  padded = tf.pad(tensor, [*paddings, [1, 0]])

  Srel = tf.reshape(padded, (-1, tensor.shape[-1] + 1, tensor.shape[-2]))[:, 1:]
  return tf.cast(tf.reshape(Srel, tensor.shape), tensor.dtype)

In [5]:
class PositionWiseFeedForwardNetwork(layers.Layer):
    def __init__(self, embed_dim, ffn_dim):
        super(PositionWiseFeedForwardNetwork, self).__init__()
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim

        self.dense1 = layers.Dense(ffn_dim, activation="relu")
        self.dense2 = layers.Dense(embed_dim)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return x

In [6]:
# create a class that applies relative positional encoding
class RelativePositionalEncoding(layers.Layer):
    def __init__(self, embed_dim, max_seq_len):
        super(RelativePositionalEncoding, self).__init__()
        self.pos_encoding = None
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

    def build(self, input_shape):
        self.pos_encoding = self.add_weight(
            shape=(self.max_seq_len, self.embed_dim),
            initializer="HeNormal",
            trainable=True,
        )

    def call(self, inputs):
        x = inputs + self.pos_encoding
        return x

In [7]:

class MultiHeadAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.depth = embed_dim // num_heads

        self.wq = layers.Dense(embed_dim)
        self.wk = layers.Dense(embed_dim)
        self.wv = layers.Dense(embed_dim)

        self.dense = layers.Dense(embed_dim)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        q, k, v, mask = inputs
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # apply relative positional encoding
        q = RelativePositionalEncoding(self.embed_dim, q.shape[2])(q)
        k = RelativePositionalEncoding(self.embed_dim, k.shape[2])(k)

        # apply skew function
        k = skew(k)

        # calculate attention
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # apply mask
        if mask is not None:
            scaled_attention_logits += mask * -1e9

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        attention = tf.matmul(attention_weights, v)

        # concat attention
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

        # apply dense layer
        output = self.dense(concat_attention)
        return output, attention_weights

In [8]:

class DecoderBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ffn_dim, dropout_rate=0.1):
        super(DecoderBlock, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim
        self.dropout_rate = dropout_rate

        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = PositionWiseFeedForwardNetwork(embed_dim, ffn_dim)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs):
        x, encoding = inputs
        # create a mask for the decoder inputs
        causal_mask = tf.linalg.band_part(tf.ones((tf.shape(x)[1], tf.shape(x)[1])), -1, 0)
        causal_mask = tf.reshape(causal_mask, (1, causal_mask.shape[0], causal_mask.shape[1]))
        causal_mask = tf.cast(causal_mask, tf.bool)
        # create a mask for the encoder inputs
        padding_mask = tf.math.equal(x, 0)
        padding_mask = tf.reshape(padding_mask, (tf.shape(x)[0], 1, 1, tf.shape(x)[1]))
        padding_mask = tf.cast(padding_mask, tf.bool)
        # create a mask for the encoder inputs
        combined_mask = tf.maximum(causal_mask, padding_mask)
        # apply multi-head attention
        x = self.att([x, x, x, combined_mask])
        x = self.dropout1(x)
        x = self.layernorm1(x + encoding)
        # apply feed forward network
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.layernorm2(x + encoding)
        return x

In [9]:
#Create the decoder class that uses relative positional encoding and also allows for a matrix input of x by 16 size and a matrix output of 16 different outputs with softmax sizes of: [129,129,129,129,129,2,200,32,32,128,64,30,3000,129,5,129]
class Decoder(layers.Layer):
    def __init__(self, num_layers, embed_dim, num_heads, ffn_dim, vocab_size, max_seq_len, dropout_rate=0.1):
        super(Decoder, self).__init__()
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.dropout_rate = dropout_rate

        self.embedding = layers.Embedding(vocab_size, embed_dim)
        self.pos_embedding = RelativePositionalEncoding(embed_dim, max_seq_len)
        self.dropout = layers.Dropout(dropout_rate)
        self.decoder_blocks = [DecoderBlock(embed_dim, num_heads, ffn_dim, dropout_rate) for _ in range(num_layers)]
        self.final_layer = layers.Dense(vocab_size)

    def call(self, inputs):
        x, encoding = inputs
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        # create embeddings
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
        x = self.pos_embedding(x)
        x = self.dropout(x)
        # apply decoder blocks
        for i in range(self.num_layers):
            x = self.decoder_blocks[i]([x, encoding])
        # final dense layer
        x = self.final_layer(x)
        return x, attention_weights

In [10]:
# create custom loss function
class CustomLoss(layers.Layer):
    def __init__(self, name="custom_loss"):
        super(CustomLoss, self).__init__(name=name)

    def call(self, y_true, y_pred):
        loss = tf.keras.losses.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = tf.reduce_mean(loss)
        return loss

In [11]:
# create learning rate schedule
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, embed_dim, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.embed_dim = embed_dim
        self.embed_dim = tf.cast(self.embed_dim, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.embed_dim) * tf.math.minimum(arg1, arg2)

In [12]:
# create training loop
def train_model(model, dataset, epochs, learning_rate):
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    train_loss = tf.keras.metrics.Mean(name="train_loss")
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name="train_accuracy"
    )
    loss_fn = CustomLoss()
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))
        for step, (x_batch_train, y_batch_train) in enumerate(dataset):
            with tf.GradientTape() as tape:
                logits = model(x_batch_train, training=True)
                loss = loss_fn(y_batch_train, logits)
            gradients = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
            train_loss(loss)
            train_accuracy(y_batch_train, logits)
            if step % 50 == 0:
                print(
                    "Training loss (for one batch) at step %d: %.4f"
                    % (step, float(train_loss.result()))
                )
                print(
                    "Training accuracy (for one batch) at step %d: %.4f"
                    % (step, float(train_accuracy.result()))
                )


In [13]:
# create model
def create_model(num_layers, embed_dim, num_heads, ffn_dim, vocab_size, max_seq_len, dropout_rate):
    inputs = layers.Input(shape=(None,), dtype=tf.int64)
    encoding = layers.Input(shape=(None, None))
    x = Decoder(num_layers, embed_dim, num_heads, ffn_dim, vocab_size, max_seq_len, dropout_rate)([inputs, encoding])
    return tf.keras.Model(inputs=[inputs, encoding], outputs=x)

In [14]:
# initialize parameters
num_layers = 2
embed_dim = 128
num_heads = 12
ffn_dim = 512
vocab_size = 2998560
dropout_rate = 0.1
max_seq_len = 102188
learning_rate = CustomSchedule(embed_dim)
epochs = 10

In [23]:
def create_tokenized_data(data):
    input_data = []
    output_data = []
    for i in range(0, len(data)):
        cur_input = np.insert(data[:i], 0, -1)
        #padd it to length 102188
        cur_input = np.pad(cur_input, (0, 102188 - len(cur_input)), 'constant', constant_values=(0))
        input_data.append(cur_input)
        output_data.append(data[i])
    return input_data,output_data

In [24]:
with (open("all_data_2.pickle", "rb")) as openfile:
    data = pickle.load(openfile)

In [25]:
dataset = create_tokenized_data(data[0])

In [26]:
dataset

([array([-1.,  0.,  0., ...,  0.,  0.,  0.]),
  array([-1.000e+00,  9.943e+03,  0.000e+00, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
          0.000e+00]),
  array([-1.000e+00,  9.943e+03,  9.949e+03, ...,  0.000e+00,  0.000e+00,
       

In [27]:
# begin training
model = create_model(num_layers, embed_dim, num_heads, ffn_dim, vocab_size, max_seq_len, dropout_rate)
#create numpy randomized dataset (2D arrays that have 16 long rows)for training

train_model(model, dataset, epochs, learning_rate)

TypeError: Exception encountered when calling layer "decoder_1" (type Decoder).

in user code:

    File "C:\Users\ilove\AppData\Local\Temp\ipykernel_19708\2130900575.py", line 30, in call  *
        x = self.decoder_blocks[i]([x, encoding])
    File "c:\Users\ilove\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ilove\AppData\Local\Temp\__autograph_generated_file6fo163rv.py", line 17, in tf__call
        combined_mask = ag__.converted_call(ag__.ld(tf).maximum, (ag__.ld(causal_mask), ag__.ld(padding_mask)), None, fscope)

    TypeError: Exception encountered when calling layer 'decoder_block_2' (type DecoderBlock).
    
    in user code:
    
        File "C:\Users\ilove\AppData\Local\Temp\ipykernel_19708\1570516264.py", line 27, in call  *
            combined_mask = tf.maximum(causal_mask, padding_mask)
    
        TypeError: Value passed to parameter 'x' has DataType bool not in list of allowed values: bfloat16, float16, float32, float64, int8, uint8, int16, uint16, int32, uint32, int64, uint64
    
    
    Call arguments received by layer 'decoder_block_2' (type DecoderBlock):
      • inputs=['tf.Tensor(shape=(None, 102188, 128), dtype=float32)', 'tf.Tensor(shape=(None, None, None), dtype=float32)']


Call arguments received by layer "decoder_1" (type Decoder):
  • inputs=['tf.Tensor(shape=(None, None), dtype=int64)', 'tf.Tensor(shape=(None, None, None), dtype=float32)']