In [214]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [242]:
class InputEmbedding(tf.keras.layers.Layer):
    
    def __init__(self,vocab_size,emb_size,input_length):
        
        
        super().__init__()
        self.emb_size = emb_size
        self.vocab_size= vocab_size
        
        self.input_emb = Embedding(
                        input_dim = self.vocab_size,
                        output_dim = self.emb_size,input_length = input_length
                                  )
    
    
    def __call__(self,x):
       
        return  self.input_emb(x)

In [243]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,batch_size,seq_len,emb_size):
        
        positions = np.arange(seq_len)[:,np.newaxis]
        depth = np.arange(emb_size)[np.newaxis, :]
        depth = (2*depth//2)/emb_size

        angle_rates = 1 / (10000**depth)

        angle_rads  = positions * angle_rates
        angle_rads[:,0::2] = np.sin(angle_rads[:,0::2])
        angle_rads[:,1::2] = np.sin(angle_rads[:,1::2])


        positions = positions * angle_rads
           
        self.pos = tf.constant(np.broadcast_to(positions,[batch_size,seq_len,emb_size]))
       
        
   
    def __call__(self):
        return self.pos
    

        
        
            
            

In [244]:
class MultiHeadAttention(tf.keras.layers.Layer):
    
    def __init__(self,emb_size,batch_size,heads,seq_len,decoder=False,mode='self'):
        super(MultiHeadAttention,self).__init__()
        
        
        """"
        Parameters:
                emb_size (int): Embedding size (e.g 512)
                batch_size (int): Batch Size
                heads (int): Number of heads (e.g 8)
                seq_len (int): Number of words in each sequence
                mode (str): self or mask -> attention
                
        Returns:
            Out (Tensor)
        
        """
        self.emb_size= emb_size
        self.heads = heads
        self.head_dim = emb_size//heads
        self.seq_len = seq_len
        self.batch_size = batch_size
        
        # Queries, Keys and Values Matrices Layers
        self.queries = tf.keras.layers.Dense(self.emb_size)
        self.keys = tf.keras.layers.Dense(self.emb_size)
        self.values = tf.keras.layers.Dense(self.emb_size)
        self.mode = mode
        self.decoder = decoder
    
    
    def self_attention(self,queries,keys,values,masked=False):
        """
         queries: (batch_size,seq_len,dim)
         keys: (batch_size,seq_len,dim)
         values: (batch_size,seq_len,dim)
         
        """
        
        out = tf.matmul(queries,tf.transpose(keys,perm=[0, 2, 1]))
        out = out/np.sqrt(self.head_dim)

        if masked:
            mask = np.tril(np.ones((self.seq_len,self.seq_len)))

            mask[mask==0] = -np.inf
            out = out + mask
        
        
        out = tf.math.softmax(out)
        out = tf.matmul(out,values)
        
        return out 
    
     
    def __call__(self,x,enc_key=[],enc_value=[]):
        
        
        # As mention in the paper first we multiply each word embedding in our case 512 with 512x512 Matrcis
        # We pass our data through the dense layer
        
        # For Multiheaded Attention
        if self.decoder==False:
            queries = self.queries(x)
            keys = self.keys(x)
            values = self.values(x)
            
        else:
            
            # Multi Headed when keys and values come from encoder part
            queries = self.queries(x)
            keys = self.keys(enc_key)
            values = self.values(enc_value)
        


        
          
        if self.mode == 'self':
            
            # Self Attention
            attention = self.self_attention(queries,keys,values)
            
        # Apply masked multiheaded attention
        if self.mode == 'mask':
            attention = self.self_attention(queries,keys,values,masked=True)
        
        
            
        # Last matrix 
        
        out = tf.keras.layers.Dense(self.emb_size)(attention)
                          
        return out
    
    

In [245]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self,
               batch_size,
               seq_len,
               emb_size=512,
               heads=8,
               forward_expansion=4):
    
        super(Encoder,self).__init__()

        self.emb_size = emb_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.heads = heads
        self.expansion_rate = forward_expansion

        self.mha = MultiHeadAttention(
                                  self.emb_size,
                                  self.batch_size,
                                  self.heads,self.seq_len
                                      )
        self.dense_1 = tf.keras.layers.Dense(int(self.emb_size*self.expansion_rate),activation='relu')
        self.dense_2 = tf.keras.layers.Dense(self.emb_size,activation='relu')

        self.layer_norm = tf.keras.layers.LayerNormalization()

  
    def __call__(self,x):

        self_attention = self.mha(x)

        x = tf.keras.layers.Add()([x,self_attention])

        x = self.layer_norm(x)

        dense = self.dense_1(x)
        dense = self.dense_2(dense)

        x = tf.keras.layers.Add()([dense,x])
        x = self.layer_norm(x)


        return x

In [246]:
class Decoder(tf.keras.layers.Layer):
    
    def __init__(self,
               batch_size,
               seq_len,
               emb_size=512,
               heads=8,
               forward_expansion=4):
        
        
        super(Decoder,self).__init__()

        self.emb_size = emb_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.heads = heads
        self.expansion_rate = forward_expansion
        
        
        #Mask Multihead Attention
        self.causal_attention = MultiHeadAttention(self.emb_size,
                                                self.batch_size,
                                                self.heads,
                                                self.seq_len,
                                                mode = 'mask')
        
        # MultiHeaded Attention
        self.mha = MultiHeadAttention(self.emb_size,
                                      self.batch_size,
                                      self.heads,
                                      self.seq_len,decoder=True)
        
        
        self.dense_1 = tf.keras.layers.Dense(int(self.emb_size*self.expansion_rate),activation='relu')
        self.dense_2 = tf.keras.layers.Dense(self.emb_size,activation='relu')

        self.layer_norm = tf.keras.layers.LayerNormalization()
      
    def __call__(self,x,enc_key,enc_value):
        
       
        # Apply mask Attention
        mask_attention = self.causal_attention(x)
        
        x = tf.keras.layers.Add()([x,mask_attention])
        x = self.layer_norm(x)
        
        # Self Attention
        self_attention = self.mha(x,enc_key,enc_value)
        
        x = tf.keras.layers.Add()([x,self_attention])
        x = self.layer_norm(x)
        
        
        
        dense = self.dense_1(x)
        dense = self.dense_2(dense)
        
        x = tf.keras.layers.Add()([x,dense])
        x = self.layer_norm(x)
        
        return x
        

In [247]:
class Transformer(tf.keras.layers.Layer):
    
    def __init__(self,
                 vocab_size,
                 seq_len,
                 batch_size,
                 emb_size=512,
                 heads=8,
                 expansion_rate=4,
                 num_modules=6
                 ):
        
        
        super(Transformer,self).__init__()
        
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.heads = heads
        self.expansion_rate = expansion_rate
        self.num_modules = num_modules
        self.batch_size = batch_size
        self.seq_len = seq_len
        
        self.encoder_layers = [Encoder(batch_size,seq_len) for _ in range(num_modules)]
        
        self.decoder_layers = [Decoder(batch_size,seq_len) for _ in range(num_modules)]
        
        self.linear = tf.keras.layers.Dense(vocab_size,activation='softmax')
   
    
    def __call__(self,input1,input2):
        
        
        # Encoder part
        
        # input embeddings
        input_embeddings = InputEmbedding(self.vocab_size,self.emb_size,self.seq_len)(input1)
        
        #positional encoding
        
        positional_encodings = PositionalEmbedding(self.batch_size,self.seq_len,self.emb_size)()
        
        enc_out = tf.keras.layers.Add()([input_embeddings,positional_encodings])
        print(enc_out.shape)

        for layer in self.encoder_layers:
            enc_out = layer(enc_out)
            
        
        
        
        # Decoder Part
        
         # input embeddings
        input_embeddings = InputEmbedding(self.vocab_size,self.emb_size,self.seq_len)(input2)
        
        #positional encoding
        
        positional_encodings = PositionalEmbedding(self.batch_size,self.seq_len,self.emb_size)()
        
        dec_out = tf.keras.layers.Add()([input_embeddings,positional_encodings])
        
        for layer in self.decoder_layers:
            dec_out = layer(dec_out,enc_out,enc_out)
        
        
        # linear Layer
        
        out = self.linear(dec_out)
        
        
        
        return out
        
        
        
    

In [248]:
# Input Embedding

vocab_size = 1000
batch_size = 10
seq_len = 5



X = np.array([[0,0,1,0,1],[1,0,1,1,1],[1,0,1,0,1],[0,0,0,0,1],[0,0,1,0,0]\
             ,[0,1,1,0,1],[1,0,1,0,1],[0,1,1,0,1],[1,0,1,0,0],[0,0,1,1,1]])

target = np.array([[0,0,1,0,1],[1,0,1,1,1],[1,0,1,0,1],[0,0,0,0,1],[0,0,1,0,0]\
             ,[0,1,1,0,1],[1,0,1,0,1],[0,1,1,0,1],[1,0,1,0,0],[0,0,1,1,1]])



In [249]:
input1 = tf.keras.Input(shape=(None,))
input2 = tf.keras.Input(shape=(None,))

transformer = Transformer(vocab_size=100, seq_len=5,batch_size=10)
out = transformer(input1, input2)
model = tf.keras.Model(inputs=[input1, input2], outputs=out)

(10, 5, 512)


In [250]:
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_55 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_22 (Embedding)       (None, None, 512)    51200       ['input_55[0][0]']               
                                                                                                  
 add_179 (Add)                  (10, 5, 512)         0           ['embedding_22[0][0]']           
                                                                                                  
 dense_2214 (Dense)             (10, 5, 512)         262656      ['add_179[0][0]']                
                                                                                            

 dense_2223 (Dense)             (10, 5, 512)         262656      ['layer_normalization_325[1][0]']
                                                                                                  
 tf.compat.v1.transpose_70 (TFO  (10, 512, 5)        0           ['dense_2224[0][0]']             
 pLambda)                                                                                         
                                                                                                  
 tf.linalg.matmul_140 (TFOpLamb  (10, 5, 5)          0           ['dense_2223[0][0]',             
 da)                                                              'tf.compat.v1.transpose_70[0][0]
                                                                 ']                               
                                                                                                  
 tf.math.truediv_70 (TFOpLambda  (10, 5, 5)          0           ['tf.linalg.matmul_140[0][0]']   
 )        

                                                                                                  
 tf.nn.softmax_72 (TFOpLambda)  (10, 5, 5)           0           ['tf.math.truediv_72[0][0]']     
                                                                                                  
 dense_2235 (Dense)             (10, 5, 512)         262656      ['layer_normalization_327[1][0]']
                                                                                                  
 tf.linalg.matmul_145 (TFOpLamb  (10, 5, 512)        0           ['tf.nn.softmax_72[0][0]',       
 da)                                                              'dense_2235[0][0]']             
                                                                                                  
 dense_2296 (Dense)             (10, 5, 512)         262656      ['tf.linalg.matmul_145[0][0]']   
                                                                                                  
 add_188 (

                                                                                                  
 dense_2242 (Dense)             (10, 5, 512)         1049088     ['dense_2241[0][0]']             
                                                                                                  
 dense_2298 (Dense)             (10, 5, 512)         262656      ['tf.linalg.matmul_149[0][0]']   
                                                                                                  
 add_191 (Add)                  (10, 5, 512)         0           ['dense_2242[0][0]',             
                                                                  'layer_normalization_329[0][0]']
                                                                                                  
 add_193 (Add)                  (10, 5, 512)         0           ['add_192[0][0]',                
                                                                  'dense_2298[0][0]']             
          

 dense_2254 (Dense)             (10, 5, 512)         262656      ['layer_normalization_331[0][0]']
                                                                                                  
 tf.compat.v1.transpose_77 (TFO  (10, 512, 5)        0           ['dense_2255[0][0]']             
 pLambda)                                                                                         
                                                                                                  
 tf.linalg.matmul_154 (TFOpLamb  (10, 5, 5)          0           ['dense_2254[0][0]',             
 da)                                                              'tf.compat.v1.transpose_77[0][0]
                                                                 ']                               
                                                                                                  
 tf.math.truediv_77 (TFOpLambda  (10, 5, 5)          0           ['tf.linalg.matmul_154[0][0]']   
 )        

 da)                                                              'dense_2264[0][0]']             
                                                                                                  
 dense_2303 (Dense)             (10, 5, 512)         262656      ['tf.linalg.matmul_159[0][0]']   
                                                                                                  
 add_200 (Add)                  (10, 5, 512)         0           ['layer_normalization_332[0][0]',
                                                                  'dense_2303[0][0]']             
                                                                                                  
 dense_2265 (Dense)             (10, 5, 2048)        1050624     ['layer_normalization_332[1][0]']
                                                                                                  
 dense_2266 (Dense)             (10, 5, 512)         1049088     ['dense_2265[0][0]']             
          

                                                                                                  
 tf.compat.v1.transpose_82 (TFO  (10, 512, 5)        0           ['dense_2276[0][0]']             
 pLambda)                                                                                         
                                                                                                  
 tf.linalg.matmul_164 (TFOpLamb  (10, 5, 5)          0           ['dense_2275[0][0]',             
 da)                                                              'tf.compat.v1.transpose_82[0][0]
                                                                 ']                               
                                                                                                  
 tf.math.truediv_82 (TFOpLambda  (10, 5, 5)          0           ['tf.linalg.matmul_164[0][0]']   
 )                                                                                                
          

                                                                                                  
 tf.linalg.matmul_169 (TFOpLamb  (10, 5, 512)        0           ['tf.nn.softmax_84[0][0]',       
 da)                                                              'dense_2285[0][0]']             
                                                                                                  
 dense_2308 (Dense)             (10, 5, 512)         262656      ['tf.linalg.matmul_169[0][0]']   
                                                                                                  
 add_208 (Add)                  (10, 5, 512)         0           ['layer_normalization_334[2][0]',
                                                                  'dense_2308[0][0]']             
                                                                                                  
 layer_normalization_335 (Layer  (10, 5, 512)        1024        ['add_208[0][0]',                
 Normaliza

In [251]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.SparseCategoricalCrossentropy(),
             metrics=['accuracy'])

In [253]:
model.fit([X,target],target,epochs=10,batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x141add3c9a0>