# Transformer

This notebook contains the code for creating the dataset from the sequences that were created in the previous notebook. After the dataset is created, the model is built and trained.

In [None]:
# Imports
import os

import numpy as np

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, Embedding, TextVectorization, Input, MultiHeadAttention

In [None]:
# Constants

# File locations
DATA_PATH = './data/movie_reviews.npy'     # sequences loaded with np.load
VOCAB_PATH = './vocab.txt'                 # vocabulary is written here, one token per line
MODEL_PATH = './model/transformer.keras'   # destination of the saved model

# Tokenisation and batching
VOCAB_SIZE = 5000    # max_tokens of the TextVectorization layer
SEQ_LENGTH = 200     # tokens per training input (the vectorizer emits SEQ_LENGTH + 1)
BATCH_SIZE = 32

# Model dimensions
EMBEDDING_DIM = 768
HEADS = 12
KEY_DIM = EMBEDDING_DIM // HEADS   # width of a single attention head
DENSE_1_NEURONS = 3072             # hidden width of the feed-forward sub-layer
MAX_LEN_UNIQUE_POS_EMBED = 1000    # number of rows in the position-embedding table

# Training
DROP_RATE = 0.1
EPOCHS = 5

## 1. Load data

In [None]:
# Read the sequences produced by the previous notebook from disk.
sequences = np.load(file=DATA_PATH)

## 2. Create dataset

In [None]:
# Create Tensorflow dataset
# Shuffle individual reviews *before* batching: shuffling after `.batch()`
# only permutes whole batches, so the same reviews would always end up
# together. The buffer holds 1000 batches' worth of reviews.
dataset = (
    tf.data.Dataset.from_tensor_slices(sequences)
    .shuffle(1000 * BATCH_SIZE)
    .batch(BATCH_SIZE)
)

# Create vocab
vectorize_layer = TextVectorization(
    standardize='lower',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    # One token more than SEQ_LENGTH so each row can be split into an input
    # and a target that is shifted by one position.
    output_sequence_length=SEQ_LENGTH + 1
)
vectorize_layer.adapt(dataset)
vocab = vectorize_layer.get_vocabulary()

# Save vocab (one token per line, in index order)
with open(VOCAB_PATH, "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in vocab)

In [None]:
def create_train_dataset(text):
    """Turn a batch of raw texts into (input, target) token-id pairs.

    The texts get a trailing axis, are tokenised by `vectorize_layer`, and the
    resulting id matrix is split so that the target is the input shifted one
    position to the left (next-token prediction).

    Args:
        text: batch of texts as yielded by `dataset`.

    Returns:
        Tuple `(x, y)`: all tokens but the last, and all tokens but the first.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, axis=-1))
    model_inputs, next_tokens = token_ids[:, :-1], token_ids[:, 1:]
    return model_inputs, next_tokens
# Tokenise batches in parallel and prefetch them so that input preparation
# overlaps with the training step instead of stalling it.
train_dataset = dataset.map(
    create_train_dataset, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

## 3. Transformer model

In [None]:
class PositionalEmbedding(Layer):
  """Sum of a token embedding and a learned position embedding.

  Args:
    vocab_size: number of rows in the token-embedding table.
    embedding_dim: width of both embedding tables.
    max_unique_pos_embed: number of rows in the position-embedding table;
      positions 0 .. sequence_length - 1 are looked up in it, so inputs must
      not be longer than this.
  """

  def __init__(self, vocab_size, embedding_dim, max_unique_pos_embed, **kwargs):
    super().__init__(**kwargs)
    self.vocab_size = vocab_size
    self.max_unique_pos_embed = max_unique_pos_embed
    self.embedding_dim = embedding_dim
    self.token_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    self.position_embedding_layer = Embedding(input_dim=max_unique_pos_embed, output_dim=embedding_dim)

  def call(self, x):
    """Embed the token ids in `x` and add the embedding of each position.

    Args:
      x: integer token ids; the last axis is the sequence axis.

    Returns:
      Tensor with one `embedding_dim`-wide vector per token.
    """
    # Read the length at run time so variable-length inputs are supported.
    # (Must be tf.shape / tf.range: the bare names are not defined, and the
    # Python built-in `range` does not accept start/limit/delta keywords.)
    maxlen = tf.shape(x)[-1]

    positions = tf.range(start=0, limit=maxlen, delta=1)
    position_embeddings = self.position_embedding_layer(positions)

    token_embeddings = self.token_embedding_layer(x)

    # The position embeddings have no batch axis and broadcast over it.
    output = token_embeddings + position_embeddings

    return output

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        "vocab_size": self.vocab_size,
        "embedding_dim": self.embedding_dim,
        "max_unique_pos_embed": self.max_unique_pos_embed,
    })
    return config

In [None]:
class SelfAttention(Layer):
  """Scaled dot-product attention.

  Computes softmax(Q K^T / sqrt(keys_dim)) V over the last two axes of the
  inputs. The layer has no weights of its own.
  """

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def call(self, query, keys, values, keys_dim, mask=None):
    """Apply attention.

    Args:
      query, keys, values: tensors whose last two axes are
        (sequence, features).
      keys_dim: feature width used to scale the scores.
      mask: optional tensor of shape [batch_size, seq_len, seq_len] in which
        a non-zero entry at [b, i, j] means "position i may attend to
        position j". A singleton axis is inserted at axis 1 so it broadcasts
        across heads.

    Returns:
      The attention-weighted combination of `values`.
    """
    # Calculate scores
    scale = tf.math.sqrt(tf.cast(keys_dim, tf.float32))
    scores = tf.matmul(query, keys, transpose_b=True) / scale

    # Prevent attention to future tokens
    if mask is not None:
      mask = tf.cast(mask, dtype=tf.float32)

      # Expand mask shape to [batch_size, 1, seq_len, seq_len]
      mask = tf.expand_dims(mask, axis=1)

      # The mask holds 1 for *allowed* positions, so the large negative bias
      # must go to the positions where it is 0. Adding `mask * -1e9` would do
      # the opposite: hide the past and expose the future tokens.
      scores += (1.0 - mask) * -1e9

    # Calculate weights with softmax
    attention_weights = tf.nn.softmax(scores, axis=-1)

    # Calculate the output
    output = tf.matmul(attention_weights, values)

    return output

# Test: run the attention layer once on random single-head inputs.
qkv_shape = (BATCH_SIZE, SEQ_LENGTH, KEY_DIM)
query, keys, values = (tf.random.normal(qkv_shape) for _ in range(3))

self_attention = SelfAttention()
output = self_attention(query, keys, values, keys_dim=KEY_DIM)

print(f"Output shape: {output.shape}", output, sep="\n")

Output shape: (32, 200, 64)
tf.Tensor(
[[[-0.01684658 -0.02843867  0.13636388 ... -0.01269964  0.21264365
   -0.10245864]
  [ 0.05744156 -0.07425009 -0.16975887 ...  0.14768438 -0.1101056
   -0.15561865]
  [ 0.10218031  0.05113341 -0.00909905 ... -0.00203522 -0.06763754
    0.07966353]
  ...
  [ 0.02877847  0.00839305 -0.11123461 ... -0.06628624  0.06627677
   -0.01747258]
  [-0.03113098  0.1329103   0.03487611 ... -0.03849317  0.12371826
   -0.05110065]
  [ 0.04300503 -0.01798058 -0.07151026 ... -0.1153248   0.02136786
   -0.01359177]]

 [[-0.04039139 -0.11168819 -0.20691945 ...  0.19812664  0.12778094
   -0.02614967]
  [-0.01306725  0.10088292  0.01217076 ...  0.09959167  0.00678044
    0.19562873]
  [ 0.04125831 -0.03818874 -0.12444432 ...  0.09522966 -0.05494731
    0.05533848]
  ...
  [-0.07768421 -0.00514741  0.00772872 ...  0.17633076  0.19051944
   -0.05156904]
  [ 0.17175764  0.3017796  -0.07001042 ...  0.08506124 -0.00249469
    0.21139127]
  [-0.07381722  0.02054632  0.01932

In [None]:
class MultiHeadAttention(Layer):
  """Multi-head attention built on top of `SelfAttention`.

  NOTE: this class has the same name as the `MultiHeadAttention` layer
  imported from `tensorflow.keras.layers` at the top of the notebook and
  replaces it from this cell onwards.

  Args:
    heads: number of attention heads.
    embedding_dim: width of the projected queries/keys/values and of the
      output; must be divisible by `heads`.

  Raises:
    ValueError: if `heads` is not positive or does not divide `embedding_dim`.
  """

  def __init__(self, heads, embedding_dim, **kwargs):
    super().__init__(**kwargs)
    # Without this check the `-1` in `split_heads` could absorb the leftover
    # features into the sequence axis and silently scramble the data.
    if heads <= 0 or embedding_dim % heads != 0:
      raise ValueError(
          f"embedding_dim ({embedding_dim}) must be divisible by a positive "
          f"number of heads ({heads})"
      )
    self.heads = heads
    self.embedding_dim = embedding_dim
    self.head_dim = embedding_dim // heads
    self.self_attention = SelfAttention()
    self.W_q = Dense(embedding_dim)
    self.W_k = Dense(embedding_dim)
    self.W_v = Dense(embedding_dim)
    self.W_o = Dense(embedding_dim)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq, embedding_dim) -> (batch, heads, seq, head_dim)."""
    x = tf.reshape(x, (batch_size, -1, self.heads, self.head_dim))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def concatenate_heads(self, attention, batch_size):
    """Inverse of `split_heads`: (batch, heads, seq, head_dim) -> (batch, seq, embedding_dim)."""
    attention = tf.transpose(attention, perm=[0, 2, 1, 3])
    return tf.reshape(attention, (batch_size, -1, self.heads * self.head_dim))

  def call(self, queries, keys, values, mask=None):
    """Project the inputs, attend per head and merge the heads again.

    Args:
      queries, keys, values: tensors of shape (batch, seq, features).
      mask: optional mask forwarded unchanged to `SelfAttention`.

    Returns:
      Tensor of shape (batch, seq, embedding_dim).
    """
    # Extract batch size (32)
    batch_size = tf.shape(queries)[0]

    # Split the queries, keys, values from
    # (32, 200, 768) -> (32, 12, 200, 64)
    Q = self.split_heads(self.W_q(queries), batch_size)
    K = self.split_heads(self.W_k(keys), batch_size)
    V = self.split_heads(self.W_v(values), batch_size)

    # Apply attention to all 12 heads
    attention = self.self_attention(Q, K, V, keys_dim=self.head_dim, mask=mask)

    # Concatenate all heads together
    # (32, 12, 200, 64) -> (32, 200, 768)
    concatenated_attention = self.concatenate_heads(attention, batch_size)

    # Calculate a last linear transformation to get the output
    output = self.W_o(concatenated_attention)

    return output

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        "heads": self.heads,
        "embedding_dim": self.embedding_dim,
    })
    return config

# Test
# Feed inputs with the feature width the layer receives inside the model
# (EMBEDDING_DIM), not the per-head width KEY_DIM.
query = tf.random.normal((BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM))
keys = tf.random.normal((BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM))
values = tf.random.normal((BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM))

multi_head_attention = MultiHeadAttention(HEADS, EMBEDDING_DIM)
output = multi_head_attention(query, keys, values)
print(f"Output shape: {output.shape}")
print(output)

Output shape: (32, 200, 768)
tf.Tensor(
[[[ 2.21717469e-02  4.14373726e-02  7.46395905e-03 ...  1.71128183e-03
   -5.63040702e-03 -2.83440622e-03]
  [ 3.24134529e-02  3.97914462e-02  4.77555487e-03 ...  1.08526545e-02
   -8.11995193e-03 -1.24990698e-02]
  [ 3.19103301e-02  4.00614031e-02  1.12193609e-02 ...  8.76557548e-03
   -5.27278567e-03 -3.98530671e-03]
  ...
  [ 3.28454003e-02  3.24166082e-02  9.83953848e-03 ...  1.29540237e-02
    9.36508295e-04 -1.09021310e-02]
  [ 3.23331542e-02  3.87845188e-02  1.29571361e-02 ...  5.07120136e-03
   -9.18035582e-03 -7.84869771e-03]
  [ 2.99939029e-02  2.72923280e-02  4.18264326e-03 ...  1.57208573e-02
   -1.68871526e-02 -3.01272538e-03]]

 [[-5.26845502e-03 -7.46474368e-03 -1.49262454e-02 ...  1.92249212e-02
   -2.65123993e-02 -5.84297115e-03]
  [ 2.59523652e-03  1.70633532e-04 -6.06783386e-03 ...  1.71769988e-02
   -4.06142361e-02 -8.70768074e-03]
  [-1.69234851e-03 -4.12062509e-03 -5.39701944e-03 ...  1.94731969e-02
   -3.56496237e-02 -4.516

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a causal mask and repeat it for every item in the batch.

    Entry [b, i, j] is true (cast to `dtype`) when
    `i >= j - n_src + n_dest`; for `n_dest == n_src` that is `i >= j`.

    Args:
        batch_size: number of copies along the first axis.
        n_dest: number of rows (destination positions).
        n_src: number of columns (source positions).
        dtype: dtype of the returned mask.

    Returns:
        Tensor of shape [batch_size, n_dest, n_src].
    """
    dest_positions = tf.expand_dims(tf.range(n_dest), axis=-1)
    src_positions = tf.range(n_src)
    allowed = dest_positions >= src_positions - n_src + n_dest

    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])

    # Tile only along the batch axis: multiples are [batch_size, 1, 1].
    multiples = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
        axis=0,
    )
    return tf.tile(single_mask, multiples)
# Preview a single 10x10 mask; it is shown transposed.
preview_mask = causal_attention_mask(1, 10, 10, dtype=tf.int32)
np.transpose(preview_mask[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

In [None]:
class TransformerBlock(Layer):
  """Causal self-attention followed by a two-layer feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization.

  Args:
    heads: number of attention heads.
    embedding_dim: width of the block's input and output.
    dense_1_neurons: width of the hidden feed-forward layer.
    dropout_rate: rate used by both dropout layers.
  """

  def __init__(self, heads, embedding_dim, dense_1_neurons, dropout_rate, **kwargs):
    super().__init__(**kwargs)

    self.heads = heads
    self.embedding_dim = embedding_dim
    self.dense_1_neurons = dense_1_neurons
    self.dropout_rate = dropout_rate

    # Multi-head attention layer
    self.multi_head_attention = MultiHeadAttention(heads, embedding_dim)

    # FFN layers
    self.dense_1 = Dense(dense_1_neurons, activation='relu')
    self.dense_2 = Dense(embedding_dim)

    # Normalization layers
    self.layer_norm_1 = LayerNormalization(epsilon=1e-6)
    self.layer_norm_2 = LayerNormalization(epsilon=1e-6)

    # Dropout layers
    self.dropout_1 = Dropout(self.dropout_rate)
    self.dropout_2 = Dropout(self.dropout_rate)

  def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
    """Return a [batch_size, n_dest, n_src] mask that is true where
    `i >= j - n_src + n_dest` (row i, column j)."""
    # All ops must be the TensorFlow ones: the unqualified names are either
    # undefined or Python built-ins with different semantics.
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)

  def call(self, x, training=None):
    """Run the block.

    Args:
      x: tensor of shape (batch, seq, embedding_dim).
      training: forwarded to the dropout layers; `None` leaves the decision
        to Keras.

    Returns:
      Tensor with the same shape as `x`.
    """
    input_shape = tf.shape(x)
    batch_size = input_shape[0]
    seq_len = input_shape[1]
    causal_mask = self.causal_attention_mask(
        batch_size, seq_len, seq_len, tf.bool
    )

    # Multi-Head Attention
    attention = self.multi_head_attention(x, x, x, mask=causal_mask)

    # Dropout
    out_drop_1 = self.dropout_1(attention, training=training)

    # Residual connection + Layer normalization
    res_1 = x + out_drop_1
    out_ln_1 = self.layer_norm_1(res_1)

    # FFN layers
    out_dense_1 = self.dense_1(out_ln_1)
    out_dense_2 = self.dense_2(out_dense_1)

    # Dropout
    out_drop_2 = self.dropout_2(out_dense_2, training=training)

    # Residual connection + Layer normalization
    res_2 = out_ln_1 + out_drop_2
    output = self.layer_norm_2(res_2)

    return output

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        "heads": self.heads,
        "embedding_dim": self.embedding_dim,
        "dense_1_neurons": self.dense_1_neurons,
        "dropout_rate": self.dropout_rate,
    })
    return config

In [None]:
# Input layer: integer token ids, variable sequence length
inputs = Input(shape=(None,), dtype=tf.int32)

# Positional Embedding
out_pos = PositionalEmbedding(
            vocab_size=VOCAB_SIZE,
            embedding_dim=EMBEDDING_DIM,
            max_unique_pos_embed=MAX_LEN_UNIQUE_POS_EMBED
        )(inputs)

# Transformer block
out_tf = TransformerBlock(
            heads=HEADS,
            embedding_dim=EMBEDDING_DIM,
            dense_1_neurons=DENSE_1_NEURONS,
            dropout_rate=DROP_RATE
        )(out_pos)

# Output layer: a probability distribution over the vocabulary per position
outputs = Dense(VOCAB_SIZE, activation="softmax")(out_tf)

transformer_model = Model(inputs=inputs, outputs=outputs)

# The model has exactly one output, so it takes exactly one loss. A list
# such as `[loss, None]` describes a two-output model and does not match.
transformer_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
)

## 4. Train model

In [None]:
# Write the weights (not the full model) to the same file after every epoch.
checkpoint_options = dict(
    filepath="./checkpoint/checkpoint.weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train for EPOCHS passes over the data, checkpointing through the callback.
training_callbacks = [model_checkpoint_callback]
hist = transformer_model.fit(x=train_dataset, epochs=EPOCHS, callbacks=training_callbacks)

Epoch 1/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4928s[0m 157ms/step - loss: 0.6567
Epoch 2/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4859s[0m 155ms/step - loss: 0.5870
Epoch 3/5
[1m  672/31250[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:18:42[0m 154ms/step - loss: 0.5970

KeyboardInterrupt: 

In [None]:
# Make sure the target folder exists before saving, so a long training run
# is not lost to a missing directory.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

transformer_model.save(MODEL_PATH)

## 5. Test

In [None]:
class ReviewGenerator():
  """Generate text token by token with a trained next-token model.

  Args:
    model: object with a `predict(x, verbose=0)` method that returns, for a
      (1, n) array of token ids, next-token probabilities of shape
      (1, n, vocab_size) - either directly or as the first element of a
      list/tuple of outputs.
    vocab: list of tokens; the position in the list is the token id.
  """

  def __init__(self, model, vocab):
    self.vocab = vocab
    self.model = model
    self.word_to_index = {word: index for index, word in enumerate(vocab)}

  def get_next_token(self, probs, temperature):
    """Sample a token id from `probs` after temperature scaling.

    Args:
      probs: 1-D array of probabilities.
      temperature: positive number; values below 1 sharpen the distribution,
        values above 1 flatten it.

    Returns:
      Tuple `(token_id, scaled_probs)`.

    Raises:
      ValueError: if `temperature` is not positive.
    """
    if temperature <= 0:
      raise ValueError(f"temperature must be positive, got {temperature}")
    probs = probs ** (1 / temperature)
    probs = probs / np.sum(probs)
    return np.random.choice(len(probs), p=probs), probs

  def _token_id(self, word):
    """Look up `word`, then its lower-case form; fall back to id 1."""
    # The lower-case fallback lets prompts with capital letters match a
    # vocabulary that was built from lower-cased text.
    return self.word_to_index.get(word, self.word_to_index.get(word.lower(), 1))

  def generate(self, start_prompt, max_tokens, temperature):
    """Extend `start_prompt` until `max_tokens` tokens or token id 0.

    Args:
      start_prompt: whitespace-separated prompt text.
      max_tokens: upper bound on prompt length plus generated tokens.
      temperature: sampling temperature, see `get_next_token`.

    Returns:
      The generated tokens (without the prompt), each followed by a space.
    """
    start_tokens = [self._token_id(word) for word in start_prompt.split()]

    next_predicted_token = None
    generated_words = []
    while len(start_tokens) < max_tokens and next_predicted_token != 0:

      x = np.array([start_tokens])
      prediction = self.model.predict(x, verbose=0)
      # A multi-output model returns a list/tuple whose first entry holds the
      # token probabilities; a single-output model returns the array itself.
      y_pred = prediction[0] if isinstance(prediction, (list, tuple)) else prediction

      next_predicted_token, _ = self.get_next_token(y_pred[0][-1], temperature)
      start_tokens.append(next_predicted_token)

      generated_words.append(self.vocab[next_predicted_token])

    return "".join(word + " " for word in generated_words)

# Bind the trained model and the vocabulary to a generator instance.
review_generator = ReviewGenerator(model=transformer_model, vocab=vocab)

In [None]:
# Generate a review for a fixed prompt and show both.
prompt = "movie review for Adrift |"
generated_text = review_generator.generate(
    start_prompt=prompt,
    max_tokens=70,
    temperature=0.9,
)

print(f"Promp: {prompt}", f"Generated review: {generated_text}", sep="\n")

Promp: movie review for Adrift |
Generated review: it ' s fitting that the actors lead to [UNK] the film with flashy acting highlights , yet doesn ' t seem far too subtle , beneath the surface , and no human [UNK] .  
