In [1]:
## import
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import convert_to_tensor, string
from tensorflow.keras.layers import TextVectorization, Embedding, Layer,Dense
from tensorflow.data import Dataset
from tensorflow.keras import Model,Input

In [2]:
output_sequence_length = 5
vocab_size = 10
sentences = [["I am a robot"], ["you too robot"]]
sentence_data = Dataset.from_tensor_slices(sentences)

In [3]:
sentence_data

<_TensorSliceDataset element_spec=TensorSpec(shape=(1,), dtype=tf.string, name=None)>

In [4]:
# Create the TextVectorization layer
vectorize_layer=TextVectorization(max_tokens=vocab_size,output_sequence_length=output_sequence_length)
# Train the layer to create a dictionary
vectorize_layer.adapt(sentence_data)
# Convert all sentences to tensors
word_tensors = convert_to_tensor(sentences, dtype=tf.string)

In [5]:
# Use the word tensors to get vectorized phrases
vectorized_words = vectorize_layer(word_tensors)
print("Vocabulary: ", vectorize_layer.get_vocabulary())
print("Vectorized words: ", vectorized_words)

Vocabulary:  ['', '[UNK]', 'robot', 'you', 'too', 'i', 'am', 'a']
Vectorized words:  tf.Tensor(
[[5 6 7 2 0]
 [3 4 2 0 0]], shape=(2, 5), dtype=int64)


In [6]:
## embedding layer
output_length = 6
word_embedding_layer=Embedding(vocab_size,output_length)
embedded_words=word_embedding_layer(vectorized_words)
print(embedded_words)

tf.Tensor(
[[[ 0.01802111 -0.03655459  0.04699257 -0.03549253 -0.03372033
   -0.04348135]
  [-0.03627764 -0.00324202  0.03054514 -0.01772243 -0.00555078
   -0.03975026]
  [-0.04530586 -0.01655366  0.01745584  0.03302291  0.02491238
   -0.02427235]
  [ 0.04008334 -0.0498448   0.00774311  0.01157374  0.03944308
    0.03408824]
  [-0.0132429  -0.01852164 -0.00074568 -0.00759749 -0.02971439
   -0.04123703]]

 [[ 0.02138865  0.0447805  -0.00412636  0.01468355 -0.04517774
    0.02160535]
  [-0.04765559  0.04301833  0.04237645 -0.04278549  0.00940941
    0.03539101]
  [ 0.04008334 -0.0498448   0.00774311  0.01157374  0.03944308
    0.03408824]
  [-0.0132429  -0.01852164 -0.00074568 -0.00759749 -0.02971439
   -0.04123703]
  [-0.0132429  -0.01852164 -0.00074568 -0.00759749 -0.02971439
   -0.04123703]]], shape=(2, 5, 6), dtype=float32)


In [7]:
## postional embedding
position_embedding_layer=Embedding(output_sequence_length,output_length)
position_indices=tf.range(output_sequence_length)
position_embeddings=position_embedding_layer(position_indices)
print(position_embeddings)

tf.Tensor(
[[ 0.03769987 -0.00326257  0.04162018  0.00956613  0.03597886 -0.03289197]
 [-0.02051154 -0.03148699 -0.03174626  0.00295677 -0.01347339  0.00110074]
 [-0.04847406  0.03707666  0.02330916  0.00303983 -0.02287695 -0.01452228]
 [-0.02172968  0.01392375 -0.04243938  0.02937079 -0.00286959  0.02439983]
 [ 0.02124126  0.01907993  0.04998076  0.01471615  0.01924132  0.01740256]], shape=(5, 6), dtype=float32)


In [8]:
final_output_embedding=embedded_words+position_embeddings
print(final_output_embedding)

tf.Tensor(
[[[ 0.05572098 -0.03981715  0.08861275 -0.0259264   0.00225853
   -0.07637332]
  [-0.05678918 -0.034729   -0.00120112 -0.01476566 -0.01902417
   -0.03864951]
  [-0.09377992  0.02052299  0.040765    0.03606273  0.00203543
   -0.03879462]
  [ 0.01835365 -0.03592105 -0.03469627  0.04094453  0.03657348
    0.05848807]
  [ 0.00799836  0.00055828  0.04923509  0.00711866 -0.01047307
   -0.02383448]]

 [[ 0.05908852  0.04151793  0.03749382  0.02424968 -0.00919888
   -0.01128662]
  [-0.06816714  0.01153134  0.01063019 -0.03982872 -0.00406399
    0.03649175]
  [-0.00839072 -0.01276815  0.03105227  0.01461357  0.01656613
    0.01956597]
  [-0.03497259 -0.00459789 -0.04318506  0.02177329 -0.03258399
   -0.01683721]
  [ 0.00799836  0.00055828  0.04923509  0.00711866 -0.01047307
   -0.02383448]]], shape=(2, 5, 6), dtype=float32)


In [9]:
## positional embedding for sub class
class PositionEmbeddingLayer(Layer):
  def __init__(self, sequence_length,vocab_size ,output_dim, **kwargs):
    super().__init__(**kwargs)
    self.wordembedding=Embedding(vocab_size,output_dim)
    self.positionembedding=Embedding(sequence_length,output_dim)

  def call(self,inputs):
    position=tf.range(tf.shape(inputs)[-1])
    wmbadding_word=self.wordembedding(inputs)
    position_embedding=self.positionembedding(position)
    return wmbadding_word+position_embedding

In [10]:
my_embedding_layer = PositionEmbeddingLayer(output_sequence_length,
vocab_size, output_length)
embedded_layer_output = my_embedding_layer(vectorized_words)
print("Output from my_embedded_layer: ", embedded_layer_output)

Output from my_embedded_layer:  tf.Tensor(
[[[-0.02115345  0.04130647  0.03703581  0.00608994  0.06194867
    0.07104433]
  [ 0.0491291   0.01616457 -0.04448112  0.08892301  0.01295964
    0.02418089]
  [-0.02373423 -0.05806943 -0.0478638   0.01348039 -0.01520481
   -0.04423137]
  [-0.00225726 -0.06576176 -0.0097317   0.06734931 -0.02840799
   -0.0123294 ]
  [-0.00975649  0.00876598  0.01595427  0.0734329   0.03291627
   -0.0702059 ]]

 [[-0.00794388  0.047996    0.01193348 -0.05446617  0.00362823
    0.05957271]
  [ 0.04448646 -0.03431307 -0.00950992  0.0209864  -0.02023845
   -0.04165429]
  [-0.01525407 -0.06365989 -0.01925647  0.01484144 -0.02852146
   -0.04437854]
  [-0.03693442  0.0050489  -0.00548879  0.08491301  0.01095913
   -0.01831951]
  [-0.00975649  0.00876598  0.01595427  0.0734329   0.03291627
   -0.0702059 ]]], shape=(2, 5, 6), dtype=float32)


In [11]:
class PositionEmbeddingFixedWeights(Layer):
  def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
    super().__init__(**kwargs)
    word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
    pos_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
    self.word_embedding_layer = Embedding(
    input_dim=vocab_size, output_dim=output_dim,
    weights=[word_embedding_matrix],
    trainable=False
    )
    self.position_embedding_layer = Embedding(
    input_dim=sequence_length, output_dim=output_dim,
    weights=[pos_embedding_matrix],
    trainable=False
    )



  def get_position_encoding(self, seq_len, d, n=10000):
    P=np.zeros((seq_len, d))
    for k in range(seq_len):
        for i in np.arange(int(d/2)):
            denominator = np.power(n, 2*i/d)
            P[k, 2*i] = np.sin(k/denominator)
            P[k, 2*i+1] = np.cos(k/denominator)
    return P
  def call(self, inputs):
    position_indices = tf.range(tf.shape(inputs)[-1])
    embedded_words = self.word_embedding_layer(inputs)
    embedded_indices = self.position_embedding_layer(position_indices)
    return embedded_words + embedded_indices

In [12]:
attnisallyouneed_embedding = PositionEmbeddingFixedWeights(output_sequence_length,
vocab_size, output_length)
attnisallyouneed_output = attnisallyouneed_embedding(vectorized_words)
print("Output from my_embedded_layer: ", attnisallyouneed_output)

Output from my_embedded_layer:  tf.Tensor(
[[[-0.9589243   1.2836622   0.23000172  1.9731903   0.01077196
    1.9999421 ]
  [ 0.56205547  1.5004725   0.3213085   1.9603932   0.01508068
    1.9999142 ]
  [ 1.566284    0.3377554   0.41192317  1.9433732   0.01938933
    1.999877  ]
  [ 1.0504174  -1.4061394   0.2314966   1.9860148   0.01077211
    1.9999698 ]
  [-0.7568025   0.3463564   0.18459873  1.982814    0.00861763
    1.9999628 ]]

 [[ 0.14112     0.0100075   0.1387981   1.9903207   0.00646326
    1.9999791 ]
  [ 0.08466846 -0.11334133  0.23099795  1.9817369   0.01077207
    1.9999605 ]
  [ 1.8185948  -0.8322937   0.185397    1.9913884   0.00861771
    1.9999814 ]
  [ 0.14112     0.0100075   0.1387981   1.9903207   0.00646326
    1.9999791 ]
  [-0.7568025   0.3463564   0.18459873  1.982814    0.00861763
    1.9999628 ]]], shape=(2, 5, 6), dtype=float32)


In [13]:
## scale dot product attention
from tensorflow import matmul, math, cast, float32
from tensorflow.keras.layers import Layer
from keras.backend import softmax
# Implementing the Scaled-Dot Product Attention
class DotProductAttention(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def call(self, queries, keys, values, d_k, mask=None):
    # Scoring the queries against the keys after transposing the latter, and scaling\

        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
        # Apply mask to the attention scores
        if mask is not None:
            scores += -1e9 * mask
        # Computing the weights by a softmax operation
        weights = softmax(scores)
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)

In [14]:
from numpy import random
input_seq_length = 5 # Maximum length of the input sequence
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
batch_size = 64 # Batch size from the training process
queries = random.random((batch_size, input_seq_length, d_k))
keys = random.random((batch_size, input_seq_length, d_k))
values = random.random((batch_size, input_seq_length, d_v))
attention = DotProductAttention()
print(attention(queries, keys, values, d_k))

tf.Tensor(
[[[0.56642634 0.48043346 0.62086624 ... 0.49831706 0.6293844  0.47573417]
  [0.56624645 0.4665729  0.61928743 ... 0.49164605 0.6250012  0.4584907 ]
  [0.567589   0.48765612 0.626608   ... 0.49058715 0.6384107  0.48801014]
  [0.57554936 0.46686202 0.6245432  ... 0.4892829  0.62839735 0.46660364]
  [0.5682196  0.47088844 0.6196643  ... 0.4991469  0.6255311  0.46260226]]

 [[0.33131826 0.29962105 0.4115417  ... 0.37526932 0.68500924 0.3527864 ]
  [0.3313189  0.30002838 0.41044855 ... 0.37486857 0.6840202  0.35353592]
  [0.34111387 0.2972543  0.41706875 ... 0.37794158 0.68390334 0.34580135]
  [0.3496281  0.29785657 0.41336313 ... 0.37822    0.67424953 0.34633207]
  [0.3376245  0.29840437 0.40891314 ... 0.37774372 0.67477894 0.35149947]]

 [[0.3769011  0.5721493  0.4358466  ... 0.27507958 0.4288712  0.5873514 ]
  [0.37967443 0.57849514 0.41524684 ... 0.27226767 0.41850993 0.57074773]
  [0.36412197 0.5736416  0.43838695 ... 0.28644666 0.40918273 0.5637267 ]
  [0.38487422 0.5755711

In [15]:
attention(queries, keys, values, d_k).shape

TensorShape([64, 5, 64])

In [16]:
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32
from tensorflow.keras.layers import Dense, Layer
from tensorflow.keras.backend import softmax
# Implementing the Scaled-Dot Product Attention
class DotProductAttention(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def call(self, queries, keys, values, d_k, mask=None):
        # Scoring the queries against the keys after transposing the latter, and scaling
        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
        # Apply mask to the attention scores
        if mask is not None:
             scores += -1e9 * mask
        # Computing the weights by a softmax operation
        weights = softmax(scores)
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)


In [17]:
 # Implementing the Multi-Head Attention
class MultiHeadAttention(Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        self.attention = DotProductAttention() # Scaled dot product attention
        self.heads = h # Number of attention heads to use
        self.d_k = d_k # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v # Dimensionality of the linearly projected values
        self.d_model = d_model # Dimensionality of the model
        self.W_q = Dense(d_k)
        # Learned projection matrix for the queries
        self.W_k = Dense(d_k)
        # Learned projection matrix for the keys
        self.W_v = Dense(d_v)
        # Learned projection matrix for the values
        self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output
    def reshape_tensor(self, x, heads, flag):
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_k)
            x = transpose(x, perm=(0, 2, 1, 3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k))
        return x
    def call(self, queries, keys, values, mask=None):
        # Rearrange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the values to be able to compute all heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Compute the multi-head attention output using the reshaped queries,
        # keys, and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
        # Apply one final linear projection to the output to generate the multi-head
        # attention. Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)

In [18]:
input_seq_length = 5 # Maximum length of the input sequence
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of the model sub-layers' outputs
batch_size = 64 # Batch size from the training process
queries = random.random((batch_size, input_seq_length, d_k))
keys = random.random((batch_size, input_seq_length, d_k))
values = random.random((batch_size, input_seq_length, d_v))
multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
print(multihead_attention(queries, keys, values))

tf.Tensor(
[[[-2.29031779e-02  6.64517581e-02 -3.04333027e-02 ... -1.62157968e-01
   -2.60966092e-01  1.22116573e-01]
  [-2.39934009e-02  6.35017008e-02 -2.95974482e-02 ... -1.59681827e-01
   -2.63196468e-01  1.25780284e-01]
  [-2.39648297e-02  6.50642812e-02 -3.07038426e-02 ... -1.61196709e-01
   -2.61221588e-01  1.23951055e-01]
  [-2.17814632e-02  6.42109215e-02 -3.06741744e-02 ... -1.59833014e-01
   -2.62974173e-01  1.23221822e-01]
  [-2.19890848e-02  6.20770715e-02 -2.99076606e-02 ... -1.56092718e-01
   -2.61918038e-01  1.26674935e-01]]

 [[ 1.03055805e-01  1.61226377e-01 -6.43157512e-02 ... -2.22388551e-01
   -3.20952386e-01  6.74936846e-02]
  [ 9.98393893e-02  1.61328539e-01 -6.43872917e-02 ... -2.23210454e-01
   -3.18335086e-01  7.05329478e-02]
  [ 1.01821452e-01  1.61877424e-01 -6.32952750e-02 ... -2.21641123e-01
   -3.20943594e-01  6.91150948e-02]
  [ 1.01731636e-01  1.61925539e-01 -6.52155131e-02 ... -2.21485630e-01
   -3.20492476e-01  6.72967806e-02]
  [ 1.02103479e-01  1.61

In [19]:
from tensorflow.keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout

In [20]:
# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer_norm = LayerNormalization()
        # Layer normalization layer
    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x
        # Apply layer normalization to the sum
        return self.layer_norm(add)


In [21]:
# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) # First fully connected layer
        self.fully_connected2 = Dense(d_model) # Second fully connected layer
        self.activation = ReLU() # ReLU activation layer
    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)
        return self.fully_connected2(self.activation(x_fc1))

In [22]:
class EncoderLayer(Layer):
    def __init__(self, sequence_length,h, d_k, d_v, d_model, d_ff, rate, **kwargs):
       super().__init__(**kwargs)
       self.build(input_shape=[None, sequence_length, d_model])
       self.multihead_attention= MultiHeadAttention(h, d_k, d_v, d_model)
       self.dropout1=Dropout(rate)
       self.add_norm1=AddNormalization()
       self.feed_forward=FeedForward(d_ff,d_model)
       self.dropout2=Dropout(rate)
       self.add_norm2=AddNormalization()
       self.sequence_length = sequence_length
       self.d_model=d_model

    def build_graph(self):
       # this function print model summary
       input_layer = Input(shape=(self.sequence_length, self.d_model))
       return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))

    def cell(self,x, padding_mask, training):
      # Multi-head attention layer
      multihead_output=self.multihead_attention(x,x,x,padding_mask)
      # Add in a dropout layer
      multihead_output=self.dropout1(multihead_output,training=training)
      # Add & Norm layer
      add_norm1_output=self.add_norm1(x,multihead_output)

      # fully connected layer

      fully_connected_output=self.feed_forward(add_norm1_output)
      # Add in a dropout layer
      fully_connected_output=self.dropout2(fully_connected_output,training=training)
      # Add & Norm layer
      add_norm2_output=self.add_norm2(add_norm1_output,fully_connected_output)

      ## return output
      return add_norm2_output

In [23]:
# Implementing the Encoder
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,
    **kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,
        d_model)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(sequence_length,h, d_k, d_v, d_model, d_ff, rate)
        for _ in range(n)]
    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
           x = layer(x, padding_mask, training)
        return x

In [24]:
enc_vocab_size = 20 # Vocabulary size for the encoder
input_seq_length = 5 # Maximum length of the input sequence
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the encoder stack
batch_size = 64 # Batch size from the training process
dropout_rate = 0.1 # Frequency of dropping the input units in the dropout layers
input_seq = random.random((batch_size, input_seq_length))
encoder = Encoder(enc_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n,
dropout_rate)
print(encoder(input_seq, None, True))

tf.Tensor(
[[[ 0.00000000e+00  2.22222233e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  2.22222233e+00]
  [ 9.34967756e-01  1.71144700e+00  9.13173616e-01 ...  2.22222233e+00
    1.15181436e-04  2.22222233e+00]
  [ 1.01033056e+00  6.48725748e-01  1.04046082e+00 ...  2.22222233e+00
    2.30362872e-04  2.22222233e+00]
  [ 1.56800017e-01  1.11194458e-02  2.72317141e-01 ...  2.22222233e+00
    3.45544337e-04  2.22222233e+00]
  [-8.40891719e-01  3.84840459e-01 -7.30185390e-01 ...  2.22222209e+00
    4.60725743e-04  2.22222233e+00]]

 [[ 0.00000000e+00  2.22222233e+00  0.00000000e+00 ...  2.22222233e+00
    0.00000000e+00  2.22222233e+00]
  [ 9.34967756e-01  1.71144700e+00  9.13173616e-01 ...  2.22222233e+00
    1.15181436e-04  2.22222233e+00]
  [ 1.01033056e+00  6.48725748e-01  1.04046082e+00 ...  2.22222233e+00
    2.30362872e-04  2.22222233e+00]
  [ 1.56800017e-01  1.11194458e-02  2.72317141e-01 ...  2.22222233e+00
    3.45544337e-04  2.22222233e+00]
  [ 0.00000000e+00  3.84

In [25]:
## decoder


In [26]:
class DecoderLayer(Layer):
    def __init__(self,sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        self.build(input_shape=[None, sequence_length, d_model])
        self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()
        self.sequence_length = sequence_length
        self.d_model = d_model

    def build_graph(self):
      # print decoder model summary
        input_layer = Input(shape=(self.sequence_length, self.d_model))
        return Model(inputs=[input_layer],
        outputs=self.call(input_layer, input_layer, None, None, True))

    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
        # Multi-head attention layer
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        multihead_output1 = self.dropout1(multihead_output1, training=training)
        # Followed by an Add & Norm layer
        addnorm_output1 = self.add_norm1(x, multihead_output1)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Followed by another multi-head attention layer
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output,
        encoder_output, padding_mask)
        # Add in another dropout layer
        multihead_output2 = self.dropout2(multihead_output2, training=training)
        # Followed by another Add & Norm layer
        addnorm_output2 = self.add_norm1(addnorm_output1, multihead_output2)
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output2)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in another dropout layer
        feedforward_output = self.dropout3(feedforward_output, training=training)
        # Followed by another Add & Norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)

In [27]:
# Implementing the Decoder
class Decoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,
    **kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,
        d_model)
        self.dropout = Dropout(rate)
        self.decoder_layer = [DecoderLayer(sequence_length,h, d_k, d_v, d_model, d_ff, rate)
        for _ in range(n)]
    def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(output_target)
        # Expected output shape = (number of sentences, sequence_length, d_model)
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.decoder_layer):
            x = layer(x, encoder_output, lookahead_mask, padding_mask, training)
        return x

In [28]:
dec_vocab_size = 20 # Vocabulary size for the decoder
input_seq_length = 5 # Maximum length of the input sequence
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the decoder stack
batch_size = 64
# Batch size from the training process
dropout_rate = 0.1
# Frequency of dropping the input units in the dropout layers
input_seq = random.random((batch_size, input_seq_length))
enc_output = random.random((batch_size, input_seq_length, d_model))
decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n,
dropout_rate)
print(decoder(input_seq, enc_output, None, True))

tf.Tensor(
[[[ 0.51025665  0.79452646  0.33135727 ... -1.1658477   1.7707444
    0.3043663 ]
  [ 0.6210786   0.7615581   0.4284633  ... -1.1828421   1.766679
    0.31316793]
  [ 0.6871866   0.67443043  0.45655155 ... -1.2032787   1.7467706
    0.28913987]
  [ 0.6554414   0.6313329   0.3906897  ... -1.2289406   1.752642
    0.24422693]
  [ 0.5624084   0.6528156   0.31103334 ... -1.2239923   1.775493
    0.2363835 ]]

 [[ 0.6002505   0.6596829   0.05162104 ... -0.6283706   1.3787189
    0.82617635]
  [ 0.71797377  0.6306014   0.15302037 ... -0.65532714  1.3848516
    0.810952  ]
  [ 0.78150415  0.5365808   0.18311563 ... -0.6957038   1.3752369
    0.7773768 ]
  [ 0.7480451   0.47183508  0.11130506 ... -0.7251461   1.3751017
    0.74939823]
  [ 0.6560516   0.4705173   0.01422885 ... -0.7241089   1.3933616
    0.7339663 ]]

 [[ 0.524349    0.7465668   0.26670966 ... -0.9327688   1.6288153
    0.6124896 ]
  [ 0.6462786   0.7280103   0.38008657 ... -0.9600306   1.6343696
    0.62072647]
  [ 

In [29]:
decoder(input_seq, enc_output, None, True).shape

TensorShape([64, 5, 512])

In [30]:
from tensorflow import math, cast, float32 ,linalg, ones,maximum, newaxis

In [31]:
# ## combain encoder and decoder model
class TransformerModel(Model):
    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length,
                 h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
        super().__init__(**kwargs)
        # Set up the encoder
        self.encoder = Encoder(enc_vocab_size, enc_seq_length, h, d_k, d_v,
        d_model, d_ff_inner, n, rate)
        self.decoder = Decoder(dec_vocab_size, dec_seq_length, h, d_k, d_v,
        d_model, d_ff_inner, n, rate)
        # Define the final dense layer
        self.model_last_layer = Dense(dec_vocab_size)


    def padding_mask(self, input):

        # Create mask which marks the zero padding values in the input by a 1.0
        mask = math.equal(input, 0)
        mask = cast(mask, float32)
        # The shape of the mask should be broadcastable to the shape
        # of the attention weights that it will be masking later on
        return mask[:, newaxis, newaxis, :]


    def lookahead_mask(self, shape):

        # Mask out future entries by marking them with a 1.0
        mask = 1 - linalg.band_part(ones((shape, shape)), -1, 0)
        return mask


    def call(self, encoder_input, decoder_input, training):

        # Create padding mask to mask the encoder inputs and the encoder
        # outputs in the decoder
        enc_padding_mask = self.padding_mask(encoder_input)
        # Create and combine padding and look-ahead masks to be fed into the decoder
        dec_in_padding_mask = self.padding_mask(decoder_input)
        dec_in_lookahead_mask = self.lookahead_mask(decoder_input.shape[1])
        dec_in_lookahead_mask = maximum(dec_in_padding_mask, dec_in_lookahead_mask)
        # Feed the input into the encoder
        encoder_output = self.encoder(encoder_input, enc_padding_mask, training)
        # Feed the encoder output into the decoder
        decoder_output = self.decoder(decoder_input, encoder_output,
        dec_in_lookahead_mask, enc_padding_mask, training)
        # Pass the decoder output through a final dense layer
        model_output = self.model_last_layer(decoder_output)
        return model_output

In [32]:
enc_vocab_size = 20 # Vocabulary size for the encoder
dec_vocab_size = 20 # Vocabulary size for the decoder
enc_seq_length = 5
dec_seq_length = 5
# Maximum length of the input sequence
# Maximum length of the target sequence
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the encoder stack
dropout_rate = 0.1
# Frequency of dropping the input units in the dropout layers

# Create model
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length,
dec_seq_length, h, d_k, d_v, d_model, d_ff, n,
dropout_rate)

In [33]:
## model summary for encoder

encoder = EncoderLayer(enc_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate)
encoder.build_graph().summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 5, 512)]          0         
                                                                 
Total params: 0 (0.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
## decoder model summary

decoder = DecoderLayer(dec_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate)
decoder.build_graph().summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 5, 512)]             0         []                            
                                                                                                  
 multi_head_attention_38 (M  (None, 5, 512)               131776    ['input_2[0][0]',             
 ultiHeadAttention)                                                  'input_2[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 dropout_66 (Dropout)        (None, 5, 512)               0         ['multi_head_attention_38[0][0
                                                                    ]']                     

In [35]:
## train our model

## data pre processesing
from pickle import load
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64


In [36]:
class PrepareDataset:
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.n_sentences = 10000 # Number of sentences to include in the dataset
        self.train_split = 0.9 # Ratio of the training data split
    # Fit a tokenizer
    def create_tokenizer(self, dataset):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)
        return tokenizer

    # Find the maximum length of the sequences
    def find_seq_length(self, dataset):
          return max(len(seq.split()) for seq in dataset)

    # Find the vocabulary size
    def find_vocab_size(self, tokenizer, dataset):
        tokenizer.fit_on_texts(dataset)
        return len(tokenizer.word_index) + 1


    def __call__(self, filename, **kwargs):
        # Load a clean dataset
        clean_dataset = load(open(filename, 'rb'))
        # Reduce dataset size
        dataset = clean_dataset[:self.n_sentences, :]
        # Include start and end of string tokens
        for i in range(dataset[:, 0].size):
            dataset[i, 0] = "<START> " + dataset[i, 0] + " <EOS>"
            dataset[i, 1] = "<START> " + dataset[i, 1] + " <EOS>"
        # Random shuffle the dataset
        shuffle(dataset)
        # Split the dataset
        train = dataset[:int(self.n_sentences * self.train_split)]
        # Prepare tokenizer for the encoder input
        enc_tokenizer = self.create_tokenizer(train[:, 0])
        enc_seq_length = self.find_seq_length(train[:, 0])
        enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])
        # Encode and pad the input sequences
        trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
        trainX = pad_sequences(trainX, maxlen=enc_seq_length, padding='post')
        trainX = convert_to_tensor(trainX, dtype=int64)
        # Prepare tokenizer for the decoder input
        dec_tokenizer = self.create_tokenizer(train[:, 1])
        dec_seq_length = self.find_seq_length(train[:, 1])
        dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])
        # Encode and pad the input sequences
        trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
        trainY = pad_sequences(trainY, maxlen=dec_seq_length, padding='post')
        trainY = convert_to_tensor(trainY, dtype=int64)
        return (trainX, trainY, train, enc_seq_length, dec_seq_length,
        enc_vocab_size, dec_vocab_size)

In [38]:
# Prepare the training data
dataset = PrepareDataset()
trainX, trainY, train_orig, enc_seq_length, dec_seq_length, \
enc_vocab_size, dec_vocab_size = dataset('drive/MyDrive/eng_ger.pkl')

In [39]:
print(train_orig[0, 0], '\n', trainX[0, :])

<START> theyre with me <EOS> 
 tf.Tensor([  1 146 135  11   2   0   0], shape=(7,), dtype=int64)


In [40]:
trainX.shape

TensorShape([9000, 7])

In [41]:
trainY.shape

TensorShape([9000, 11])

In [42]:
enc_seq_length

7

In [43]:
dec_seq_length

11

In [44]:
enc_vocab_size

2296

In [45]:
dec_vocab_size

3654

In [49]:
#traning
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.metrics import Mean
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, \
float32, GradientTape, TensorSpec, function, int64
from tensorflow.keras.losses import sparse_categorical_crossentropy

from time import time

[31mERROR: Could not find a version that satisfies the requirement model (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for model[0m[31m
[0m

In [56]:
# Define the model parameters
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers' outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack
# Define the training parameters
epochs = 5
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1
# Implementing a learning rate scheduler
class LRScheduler(LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, **kwargs):
        super().__init__(**kwargs)
        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps
    def __call__(self, step_num):
        # Linearly increasing the learning rate for the first warmup_steps, and
        # decreasing it thereafter
        step_num = cast(step_num, float32)
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

In [57]:
optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)
# Prepare the training and test splits of the dataset
dataset = PrepareDataset()
trainX, trainY, train_orig, enc_seq_length, dec_seq_length, \
enc_vocab_size, dec_vocab_size = dataset('drive/MyDrive/eng_ger.pkl')
# Prepare the dataset batches
train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))
train_dataset = train_dataset.batch(batch_size)
# Create model
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length,
dec_seq_length, h, d_k, d_v, d_model, d_ff, n,
dropout_rate)

# Defining the loss function
def loss_fcn(target, prediction):
    # Create mask so that the zero padding values are not included in the
    # computation of loss
    mask = math.logical_not(equal(target, 0))
    mask = cast(mask, float32)
    # Compute a sparse categorical cross-entropy loss on the unmasked values
    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask
    # Compute the mean loss over the unmasked values
    return reduce_sum(loss) / reduce_sum(mask)

# Defining the accuracy function
def accuracy_fcn(target, prediction):
    # Create mask so that the zero padding values are not included in the
    # computation of accuracy
    mask = math.logical_not(equal(target, 0))

    # Find equal prediction and target values, and apply the padding mask
    accuracy = equal(target, argmax(prediction, axis=2))

    accuracy = math.logical_and(mask, accuracy)

    # Cast the True/False values to 32-bit-precision floating-point numbers
    mask = cast(mask, float32)

    accuracy = cast(accuracy, float32)

    # Compute the mean accuracy over the unmasked values
    return reduce_sum(accuracy) / reduce_sum(mask)

# Include metrics monitoring
train_loss = Mean(name='train_loss')
train_accuracy = Mean(name='train_accuracy')
# Create a checkpoint object and manager to manage multiple checkpoints
ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=3)

In [58]:
# Speeding up the training process
@function
def train_step(encoder_input, decoder_input, decoder_output):
    with GradientTape() as tape:

        # Run the forward pass of the model to generate a prediction
        prediction = training_model(encoder_input, decoder_input, training=True)

        # Compute the training loss
        loss = loss_fcn(decoder_output, prediction)

        # Compute the training accuracy
        accuracy = accuracy_fcn(decoder_output, prediction)

    # Retrieve gradients of the trainable variables with respect to the training loss
    gradients = tape.gradient(loss, training_model.trainable_weights)

    # Update the values of the trainable variables by gradient descent
    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))
    train_loss(loss)
    train_accuracy(accuracy)


for epoch in range(epochs):
    train_loss.reset_states()

    train_accuracy.reset_states()

    print("\nStart of epoch %d" % (epoch + 1))

    start_time = time()

    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset):
    # Define the encoder and decoder inputs, and the decoder output
        encoder_input = train_batchX[:, 1:]

        decoder_input = train_batchY[:, :-1]

        decoder_output = train_batchY[:, 1:]
        train_step(encoder_input, decoder_input, decoder_output)

        if step % 50 == 0:
            print(f"Epoch {epoch+1} Step {step} Loss {train_loss.result():.4f} "
            + f"Accuracy {train_accuracy.result():.4f}")

    # Print epoch number and loss value at the end of every epoch
    print(f"Epoch {epoch+1}: Training Loss {train_loss.result():.4f}, "
    + f"Training Accuracy {train_accuracy.result():.4f}")

    # Save a checkpoint after every five epochs
    if (epoch + 1) % 5 == 0:
        save_path = ckpt_manager.save()
        print(f"Saved checkpoint at epoch {epoch+1}")

print("Total time taken: %.2fs" % (time() - start_time))


Start of epoch 1
Epoch 1 Step 0 Loss 8.3762 Accuracy 0.0000
Epoch 1 Step 50 Loss 7.9002 Accuracy 0.0982
Epoch 1 Step 100 Loss 7.2666 Accuracy 0.1585
Epoch 1: Training Loss 6.9172, Training Accuracy 0.1792

Start of epoch 2
Epoch 2 Step 0 Loss 5.8459 Accuracy 0.2584
Epoch 2 Step 50 Loss 5.4896 Accuracy 0.2691
Epoch 2 Step 100 Loss 5.3116 Accuracy 0.2787
Epoch 2: Training Loss 5.1874, Training Accuracy 0.2850

Start of epoch 3
Epoch 3 Step 0 Loss 4.8855 Accuracy 0.2819
Epoch 3 Step 50 Loss 4.6786 Accuracy 0.3119
Epoch 3 Step 100 Loss 4.5753 Accuracy 0.3258
Epoch 3: Training Loss 4.5005, Training Accuracy 0.3353

Start of epoch 4
Epoch 4 Step 0 Loss 4.4162 Accuracy 0.3423
Epoch 4 Step 50 Loss 4.1786 Accuracy 0.3734
Epoch 4 Step 100 Loss 4.1084 Accuracy 0.3803
Epoch 4: Training Loss 4.0497, Training Accuracy 0.3864

Start of epoch 5
Epoch 5 Step 0 Loss 4.0175 Accuracy 0.3926
Epoch 5 Step 50 Loss 3.8000 Accuracy 0.4146
Epoch 5 Step 100 Loss 3.7472 Accuracy 0.4201
Epoch 5: Training Loss 3.6