In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [73]:
class InputEmbedding(tf.keras.layers.Layer):
    """Trainable lookup table that maps integer token ids to dense vectors.

    Parameters:
        vocab_size (int): Number of distinct token ids (size of the table).
        emb_size (int): Dimensionality of every embedding vector.
        input_length (int): Length of the input sequences; forwarded
            unchanged to ``tf.keras.layers.Embedding``.
    """

    def __init__(self,vocab_size,emb_size,input_length):
        super().__init__()
        self.emb_size = emb_size
        self.vocab_size = vocab_size

        self.input_emb = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.emb_size,
            input_length=input_length,
        )

    def call(self,x):
        """Look up the embedding of every token id in ``x``.

        Implemented as ``call`` (not ``__call__``) so that Keras' own
        ``Layer.__call__`` wraps the forward pass; ``layer(x)`` keeps working.

        Returns:
            The output of the wrapped ``Embedding`` layer for ``x``.
        """
        return self.input_emb(x)

In [74]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Fixed sinusoidal positional encodings.

    For position ``p`` and embedding column ``i`` the encoding is

        sin(p / 10000 ** (2 * (i // 2) / emb_size))   for even ``i``
        cos(p / 10000 ** (2 * (i // 2) / emb_size))   for odd ``i``

    The table is computed once in the constructor and stored in ``self.pos``.

    Parameters:
        batch_size (int): Number of sequences; the table is repeated this
            many times along the first axis.
        seq_len (int): Number of positions to encode.
        emb_size (int): Dimensionality of each encoding vector.
    """

    def __init__(self,batch_size,seq_len,emb_size):
        # Keras layers must be initialised before attributes are assigned.
        super().__init__()

        positions = np.arange(seq_len)[:, np.newaxis]      # (seq_len, 1)
        columns = np.arange(emb_size)[np.newaxis, :]       # (1, emb_size)

        # Columns 2k and 2k+1 share the exponent 2k / emb_size. The
        # parentheses matter: ``2*columns//2`` would evaluate to ``columns``.
        exponents = (2 * (columns // 2)) / emb_size
        angle_rates = 1.0 / np.power(10000.0, exponents)

        angle_rads = positions * angle_rates               # (seq_len, emb_size)

        encodings = np.zeros_like(angle_rads)
        encodings[:, 0::2] = np.sin(angle_rads[:, 0::2])   # even columns
        encodings[:, 1::2] = np.cos(angle_rads[:, 1::2])   # odd columns

        # float32 so the table can be added to embedding outputs directly.
        encodings = encodings.astype(np.float32)
        self.pos = tf.constant(
            np.broadcast_to(encodings, [batch_size, seq_len, emb_size])
        )

    def __call__(self):
        """Return the precomputed ``(batch_size, seq_len, emb_size)`` tensor.

        ``__call__`` is kept (rather than ``call``) because the layer is
        invoked without any input argument.
        """
        return self.pos
    

        
        
            
            

# Multi-Head Attention (Self and Masked)

In [75]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Parameters:
        emb_size (int): Embedding size (e.g. 512). Must be divisible by
            ``heads``.
        batch_size (int): Batch size. Stored for reference only; the forward
            pass reads the batch dimension from its input.
        heads (int): Number of attention heads (e.g. 8).
        seq_len (int): Number of tokens in each sequence. Stored for
            reference only; the forward pass reads it from its input.
        mode (str): ``'self'`` for unmasked attention or ``'mask'`` for
            causal (look-back only) attention.

    Raises:
        ValueError: If ``emb_size`` is not divisible by ``heads`` or ``mode``
            is neither ``'self'`` nor ``'mask'``.
    """

    def __init__(self,emb_size,batch_size,heads,seq_len,mode='self'):
        super(MultiHeadAttention,self).__init__()

        if emb_size % heads != 0:
            raise ValueError(
                f"emb_size ({emb_size}) must be divisible by heads ({heads})"
            )
        if mode not in ('self', 'mask'):
            raise ValueError(f"mode must be 'self' or 'mask', got {mode!r}")

        self.emb_size = emb_size
        self.heads = heads
        self.head_dim = emb_size // heads
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.mode = mode

        # Query, key and value projection layers.
        self.queries = tf.keras.layers.Dense(self.emb_size)
        self.keys = tf.keras.layers.Dense(self.emb_size)
        self.values = tf.keras.layers.Dense(self.emb_size)

        # Final output projection. Created once here so its weights persist
        # between forward passes instead of being re-initialised per call.
        self.out_proj = tf.keras.layers.Dense(self.emb_size)

    @staticmethod
    def _is_missing(value):
        """True for ``None`` or an empty list/tuple (the legacy default)."""
        return value is None or (
            isinstance(value, (list, tuple)) and len(value) == 0
        )

    def _split_heads(self,x):
        """Reshape ``[batch, seq, emb]`` into ``[batch, heads, seq, head_dim]``."""
        x = tf.reshape(x, [tf.shape(x)[0], -1, self.heads, self.head_dim])
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def _merge_heads(self,x):
        """Reshape ``[batch, heads, seq, head_dim]`` into ``[batch, seq, emb]``."""
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        return tf.reshape(x, [tf.shape(x)[0], -1, self.emb_size])

    def _attend(self,queries,keys,values,causal):
        """Scaled dot-product attention on ``[batch, heads, seq, head_dim]`` inputs.

        When ``causal`` is true, query position ``i`` may only attend to key
        positions ``<= i``.

        Returns:
            Tensor of shape ``[batch, heads, query_seq, head_dim]``.
        """
        # scores[b, h, i, j] = <queries[b, h, i], keys[b, h, j]>
        scores = tf.matmul(queries, keys, transpose_b=True)
        scores = scores / tf.math.sqrt(tf.cast(self.head_dim, scores.dtype))

        if causal:
            query_len = tf.shape(scores)[-2]
            key_len = tf.shape(scores)[-1]
            # Lower-triangular ones (diagonal included) mark visible keys.
            visible = tf.linalg.band_part(
                tf.ones([query_len, key_len], dtype=scores.dtype), -1, 0
            )
            # A large negative score becomes a ~0 weight after softmax.
            scores = scores + (1.0 - visible) * -1e9

        # Normalise over the key axis so each query's weights sum to 1.
        weights = tf.nn.softmax(scores, axis=-1)
        return tf.matmul(weights, tf.cast(values, weights.dtype))

    def self_attention(self,queries,keys,values):
        """Unmasked attention.

        Takes ``[batch, heads, seq, head_dim]`` tensors and returns a
        ``[batch, seq, emb_size]`` tensor.
        """
        return self._merge_heads(self._attend(queries, keys, values, causal=False))

    # Original (misspelled) method name, kept so existing callers still work.
    self_attenion = self_attention

    def mask_attention(self,queries,keys,values):
        """Causal attention: each position attends to itself and earlier ones.

        Takes ``[batch, heads, seq, head_dim]`` tensors and returns a
        ``[batch, seq, emb_size]`` tensor.
        """
        return self._merge_heads(self._attend(queries, keys, values, causal=True))

    def call(self,x,enc_key=None,enc_value=None):
        """Run attention over ``x``.

        Parameters:
            x: Tensor of shape ``[batch, seq, emb_size]``; source of the
                queries.
            enc_key: Optional tensor to project into keys (encoder output
                for encoder-decoder attention). ``None`` or ``[]`` means
                "use ``x``".
            enc_value: Optional tensor to project into values. ``None`` or
                ``[]`` means "use ``x``".

        Returns:
            Tensor of shape ``[batch, seq, emb_size]``.

        Raises:
            ValueError: If only one of ``enc_key`` / ``enc_value`` is given.
        """
        key_missing = self._is_missing(enc_key)
        value_missing = self._is_missing(enc_value)

        if key_missing and value_missing:
            # Plain self-attention: keys and values come from x itself.
            key_source, value_source = x, x
        elif key_missing or value_missing:
            raise ValueError("enc_key and enc_value must be provided together")
        else:
            # Encoder-decoder attention: keys and values come from the encoder.
            key_source, value_source = enc_key, enc_value

        queries = self._split_heads(self.queries(x))
        keys = self._split_heads(self.keys(key_source))
        values = self._split_heads(self.values(value_source))

        if self.mode == 'mask':
            attention = self.mask_attention(queries, keys, values)
        else:
            attention = self.self_attention(queries, keys, values)

        return self.out_proj(attention)
    
    

# Encoder Block

In [76]:
class Encoder(tf.keras.layers.Layer):
    """One Transformer encoder block.

    Self-attention followed by a position-wise feed-forward network, each
    wrapped in a residual connection and its own layer normalisation.

    Parameters:
        batch_size (int): Batch size, forwarded to ``MultiHeadAttention``.
        seq_len (int): Sequence length, forwarded to ``MultiHeadAttention``.
        emb_size (int): Embedding size of the block's input and output.
        heads (int): Number of attention heads.
        forward_expansion (int): The hidden feed-forward layer has
            ``emb_size * forward_expansion`` units.
    """

    def __init__(self,
               batch_size,
               seq_len,
               emb_size=512,
               heads=8,
               forward_expansion=4):

        super(Encoder,self).__init__()

        self.emb_size = emb_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.heads = heads
        self.expansion_rate = forward_expansion

        self.mha = MultiHeadAttention(
            self.emb_size,
            self.batch_size,
            self.heads,
            self.seq_len,
        )

        # Feed-forward network: ReLU on the hidden layer only; the projection
        # back to emb_size is linear.
        self.dense_1 = tf.keras.layers.Dense(
            int(self.emb_size * self.expansion_rate), activation='relu'
        )
        self.dense_2 = tf.keras.layers.Dense(self.emb_size)

        # Each residual branch gets its own normalisation parameters.
        self.layer_norm = tf.keras.layers.LayerNormalization()      # after attention
        self.layer_norm_ffn = tf.keras.layers.LayerNormalization()  # after feed-forward

    def call(self,x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Self-attention sub-layer with residual connection.
        x = self.layer_norm(x + self.mha(x))

        # Feed-forward sub-layer with residual connection.
        feed_forward = self.dense_2(self.dense_1(x))
        return self.layer_norm_ffn(x + feed_forward)

# Decoder

In [77]:
class Decoder(tf.keras.layers.Layer):
    """One Transformer decoder block.

    Three sub-layers, each with a residual connection and its own layer
    normalisation: masked self-attention, attention over the encoder output,
    and a position-wise feed-forward network.

    Parameters:
        batch_size (int): Batch size, forwarded to ``MultiHeadAttention``.
        seq_len (int): Sequence length, forwarded to ``MultiHeadAttention``.
        emb_size (int): Embedding size of the block's input and output.
        heads (int): Number of attention heads.
        forward_expansion (int): The hidden feed-forward layer has
            ``emb_size * forward_expansion`` units.
    """

    def __init__(self,
               batch_size,
               seq_len,
               emb_size=512,
               heads=8,
               forward_expansion=4):

        super(Decoder,self).__init__()

        self.emb_size = emb_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.heads = heads
        self.expansion_rate = forward_expansion

        # Masked multi-head attention over the decoder's own input.
        self.causal_attention = MultiHeadAttention(self.emb_size,
                                                   self.batch_size,
                                                   self.heads,
                                                   self.seq_len,
                                                   mode='mask')

        # Multi-head attention whose keys/values come from the encoder.
        self.mha = MultiHeadAttention(self.emb_size,
                                      self.batch_size,
                                      self.heads,
                                      self.seq_len)

        # Feed-forward network: ReLU on the hidden layer only; the projection
        # back to emb_size is linear.
        self.dense_1 = tf.keras.layers.Dense(
            int(self.emb_size * self.expansion_rate), activation='relu'
        )
        self.dense_2 = tf.keras.layers.Dense(self.emb_size)

        # Each residual branch gets its own normalisation parameters.
        self.layer_norm = tf.keras.layers.LayerNormalization()        # after masked attention
        self.layer_norm_cross = tf.keras.layers.LayerNormalization()  # after encoder attention
        self.layer_norm_ffn = tf.keras.layers.LayerNormalization()    # after feed-forward

    def call(self,x,enc_key,enc_value):
        """Apply the block.

        Parameters:
            x: Decoder input; source of the queries.
            enc_key: Tensor passed to the second attention layer as the
                source of its keys.
            enc_value: Tensor passed to the second attention layer as the
                source of its values.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Masked self-attention sub-layer.
        x = self.layer_norm(x + self.causal_attention(x))

        # Encoder-decoder attention sub-layer.
        x = self.layer_norm_cross(x + self.mha(x, enc_key, enc_value))

        # Feed-forward sub-layer.
        feed_forward = self.dense_2(self.dense_1(x))
        return self.layer_norm_ffn(x + feed_forward)
        

# Transformer

In [78]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer.

    Parameters:
        vocab_size (int): Number of token ids; also the size of the output
            distribution.
        seq_len (int): Length of the input and target sequences.
        batch_size (int): Number of sequences per batch.
        emb_size (int): Embedding size used throughout the model.
        heads (int): Number of attention heads in every block.
        expansion_rate (int): Feed-forward expansion factor in every block.
        num_modules (int): Number of encoder blocks and of decoder blocks.
    """

    def __init__(self,
                 vocab_size,
                 seq_len,
                 batch_size,
                 emb_size=512,
                 heads=8,
                 expansion_rate=4,
                 num_modules=6
                 ):

        super(Transformer,self).__init__()

        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.heads = heads
        self.expansion_rate = expansion_rate
        self.num_modules = num_modules
        self.batch_size = batch_size
        self.seq_len = seq_len

        # Embedding tables are created once so their weights persist between
        # forward passes; encoder and decoder each get their own table.
        self.encoder_embedding = InputEmbedding(vocab_size, emb_size, seq_len)
        self.decoder_embedding = InputEmbedding(vocab_size, emb_size, seq_len)

        # The positional table is constant, so compute it a single time and
        # share it between the encoder and decoder inputs.
        self.positional_encodings = PositionalEmbedding(batch_size, seq_len, emb_size)()

        # Forward the model's hyper-parameters so every block agrees with
        # the embedding size instead of falling back to the block defaults.
        self.encoder_layers = [
            Encoder(batch_size, seq_len,
                    emb_size=emb_size,
                    heads=heads,
                    forward_expansion=expansion_rate)
            for _ in range(num_modules)
        ]

        self.decoder_layers = [
            Decoder(batch_size, seq_len,
                    emb_size=emb_size,
                    heads=heads,
                    forward_expansion=expansion_rate)
            for _ in range(num_modules)
        ]

        self.linear = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def _embed(self,embedding_layer,tokens):
        """Embed ``tokens`` and add the positional encodings."""
        embedded = embedding_layer(tokens)
        # Cast so the sum is valid whatever dtype the table was built with.
        return embedded + tf.cast(self.positional_encodings, embedded.dtype)

    def call(self,input1,input2):
        """Run the full model.

        Parameters:
            input1: Integer token ids fed to the encoder.
            input2: Integer token ids fed to the decoder.

        Returns:
            Output of the final softmax ``Dense`` layer, with ``vocab_size``
            entries along the last axis.
        """
        # Encoder stack.
        enc_out = self._embed(self.encoder_embedding, input1)
        for layer in self.encoder_layers:
            enc_out = layer(enc_out)

        # Decoder stack; every block attends over the final encoder output.
        dec_out = self._embed(self.decoder_embedding, input2)
        for layer in self.decoder_layers:
            dec_out = layer(dec_out, enc_out, enc_out)

        # Project onto the vocabulary.
        return self.linear(dec_out)
        
        
        
    

In [79]:
# Toy configuration and data used to smoke-test the model.

vocab_size = 1000
batch_size = 10
seq_len = 5

# Ten sequences (batch_size) of five binary token ids each (seq_len).
X = np.array([
    [0, 0, 1, 0, 1],
    [1, 0, 1, 1, 1],
    [1, 0, 1, 0, 1],
    [0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0],
    [0, 1, 1, 0, 1],
    [1, 0, 1, 0, 1],
    [0, 1, 1, 0, 1],
    [1, 0, 1, 0, 0],
    [0, 0, 1, 1, 1],
])

# The target batch holds the same values as X, as an independent array.
target = X.copy()


In [80]:
# Instantiate the model and immediately run one forward pass on the toy batch.
transformers = Transformer(
    vocab_size=vocab_size,
    seq_len=seq_len,
    batch_size=batch_size,
)(X, target)

In [81]:
# Sanity check: with the toy settings above the recorded output shape is
# (10, 5, 1000), i.e. (batch_size, seq_len, vocab_size).
transformers.shape

TensorShape([10, 5, 1000])