In [13]:
# Colab-only: attach the user's Google Drive to the runtime's filesystem.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [14]:
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.datasets import imdb
from keras.utils import pad_sequences

In [20]:
class TransformerBlock(Layer):
  """Transformer encoder block: self-attention, then a position-wise feed-forward net.

  Each of the two sub-layers is followed by dropout, a residual (skip)
  connection and layer normalization, so the output has the same shape as
  the input.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
    """Create the sub-layers.

    Args:
      embed_dim: Size of the last axis of the input and output. It is also
        used as the per-head key size (`key_dim`) of the attention layer.
      num_heads: Number of attention heads.
      ff_dim: Number of units in the hidden (ReLU) layer of the
        feed-forward network.
      rate: Dropout rate applied after the attention output and after the
        feed-forward output.
      **kwargs: Standard `Layer` keyword arguments (e.g. `name`, `dtype`).
    """
    super().__init__(**kwargs)
    # Remember the constructor arguments so get_config() can report them.
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.ff_dim = ff_dim
    self.rate = rate

    # Multi-head attention; it is used as self-attention in call().
    self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    # Feed-forward network: expand to ff_dim, then project back to embed_dim
    # so the result can be added to the residual branch.
    self.ffn = Sequential(
        [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
    )
    # One normalization layer per residual connection.
    self.layernormal1 = LayerNormalization(epsilon=1e-6)
    self.layernormal2 = LayerNormalization(epsilon=1e-6)
    # One dropout layer per sub-layer output.
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)

  def call(self, inputs, training=None):
    """Run the block on `inputs`.

    Args:
      inputs: Tensor whose last axis has size `embed_dim` (required by the
        residual additions below).
      training: Forwarded to the dropout layers. With the default `None`,
        Keras decides from the surrounding context (fit vs. predict).

    Returns:
      A tensor with the same shape as `inputs`.
    """
    # Self-attention: the sequence serves as both query and value.
    attn_output = self.att(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    # First residual connection + normalization.
    out1 = self.layernormal1(inputs + attn_output)
    # Feed-forward network on the normalized activations.
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    # Second residual connection + normalization.
    return self.layernormal2(out1 + ffn_output)

  def get_config(self):
    """Return the constructor arguments so the layer can be serialized."""
    config = super().get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
        "ff_dim": self.ff_dim,
        "rate": self.rate,
    })
    return config



In [21]:
class TokenAndPositionEmbedding(Layer):
  """Embed token ids and add a learned embedding of each token's position.

  The output combines what each token is (token embedding) with where it
  sits in the sequence (position embedding).
  """

  def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
    """Create the two embedding tables.

    Args:
      maxlen: Number of rows in the position table, i.e. the longest
        sequence for which a position embedding exists.
      vocab_size: Number of rows in the token table; token ids are expected
        to lie in `[0, vocab_size)`.
      embed_dim: Size of each embedding vector (shared by both tables so
        they can be added).
      **kwargs: Standard `Layer` keyword arguments (e.g. `name`, `dtype`).
    """
    super().__init__(**kwargs)
    # Remember the constructor arguments so get_config() can report them.
    self.maxlen = maxlen
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim

    # Maps each token id to a dense vector of size embed_dim.
    self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
    # Maps each position (0 .. maxlen - 1) to a dense vector of size embed_dim.
    self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, x):
    """Return token embeddings plus position embeddings for `x`.

    Args:
      x: Integer tensor of token ids whose last axis is the sequence axis.
        Its length must not exceed the `maxlen` given to the constructor,
        otherwise positions would index outside the position table.

    Returns:
      A tensor with one extra trailing axis of size `embed_dim`.
    """
    # Actual length of the incoming sequences (taken from the data, not
    # from the constructor argument).
    seq_len = tf.shape(x)[-1]
    # Positions 0 .. seq_len - 1 and their embeddings.
    positions = tf.range(start=0, limit=seq_len, delta=1)
    position_vectors = self.pos_emb(positions)
    token_vectors = self.token_emb(x)
    # position_vectors has no batch axis; the addition broadcasts it over
    # every sequence in the batch.
    return token_vectors + position_vectors

  def get_config(self):
    """Return the constructor arguments so the layer can be serialized."""
    config = super().get_config()
    config.update({
        "maxlen": self.maxlen,
        "vocab_size": self.vocab_size,
        "embed_dim": self.embed_dim,
    })
    return config

In [22]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Every review is padded/truncated to exactly this many tokens

# The second split returned by imdb.load_data is used as validation data.
(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training Sequences")
print(len(x_val), "Validation sequences")

# "pre" is the pad_sequences default for both options and is spelled out here
# because it is easy to misread: reviews longer than maxlen keep their LAST
# maxlen tokens (the beginning is cut off), and shorter reviews are padded at
# the front.
x_train = pad_sequences(x_train, maxlen=maxlen, padding="pre", truncating="pre")
x_val = pad_sequences(x_val, maxlen=maxlen, padding="pre", truncating="pre")

25000 Training Sequences
25000 Validation sequences


In [23]:
# Inspect the padded training array; expected shape is (number of reviews, maxlen).
x_train.shape

(25000, 200)

In [24]:
# Hyperparameters of the classifier.
embed_dim = 32  # Embedding size of each token
num_heads = 2   # Number of attention heads
ff_dim = 32     # Hidden layer size in feedforward network inside transformer

# Token ids -> token + position embeddings -> one transformer block.
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
)
x = transformer_block(x)

# Average over the sequence axis, then a small dense classification head.
x = GlobalAveragePooling1D()(x)
x = Dropout(rate=0.1)(x)
x = Dense(units=20, activation="relu")(x)
x = Dropout(rate=0.1)(x)
# Two output units with softmax: one probability per sentiment class.
outputs = Dense(units=2, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [25]:
# The sparse categorical loss takes integer class labels directly and
# compares them with the two-way softmax output of the model.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep the returned History object so the per-epoch metrics can be inspected later.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 200)]             0         
                                                                 
 token_and_position_embeddi  (None, 200, 32)           646400    
 ng_3 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_3 (Trans  (None, 200, 32)           10656     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_8 (Dropout)         (None, 32)                0     