In [1]:
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.datasets import imdb
from keras.utils import pad_sequences

2023-12-22 08:49:09.861185: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-22 08:49:09.865055: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-22 08:49:09.921958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-22 08:49:09.921996: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-22 08:49:09.923385: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
class TransformersBlock(Layer):
    """Single Transformer encoder block: self-attention, then a feedforward network.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input and of the output; also
            used as the per-head ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feedforward network.
        rate: Dropout rate applied to the output of each sub-layer.
        **kwargs: Forwarded to ``Layer.__init__`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Multi-head self-attention over the sequence.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Position-wise feedforward network; it projects back to embed_dim so
        # its output can be added to its input in the residual connection.
        self.ffn = Sequential(
            [
                Dense(ff_dim, activation="relu"),
                Dense(embed_dim),
            ]
        )
        # One LayerNormalization per sub-layer (attention, feedforward).
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        # One Dropout per sub-layer, both using the same rate.
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs`` and return a tensor of the same shape.

        Args:
            inputs: Input tensor; its last axis must have size ``embed_dim``
                so that the residual additions are valid.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide based on the current execution context.
        """
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection around the attention sub-layer, then normalize.
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        # Dropout must be applied to the feedforward output. The previous code
        # passed attn_output here, which silently discarded the feedforward
        # result and added the attention output a second time.
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection around the feedforward sub-layer, then normalize.
        return self.layernorm2(out1 + ffn_output)


In [3]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        maxlen: Number of distinct positions the position table can embed
            (indices 0 .. maxlen-1).
        vocab_size: Number of distinct token ids the token table can embed.
        embed_dim: Size of every embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # Lookup table: token id -> dense vector of size embed_dim.
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Lookup table: position index -> dense vector of size embed_dim.
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add one position vector per step."""
        # Length of the last axis of the incoming batch, read at run time.
        seq_len = tf.shape(x)[-1]
        # Position indices 0, 1, ..., seq_len - 1.
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # Element-wise sum; the position vectors broadcast over the leading
        # axes of the token embeddings.
        return token_vectors + position_vectors

In [4]:
# Vocabulary cap: only the 20k most frequent words are kept by imdb.load_data.
vocab_size = 20000
# Fixed sequence length fed to the model. NOTE: pad_sequences pads AND
# truncates at the front by default, so reviews longer than maxlen keep
# their LAST 200 tokens (not the first 200).
maxlen = 200

(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

# Bring every review to exactly maxlen token ids.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_val = pad_sequences(x_val, maxlen=maxlen)

25000 Training sequences
25000 Validation sequences


In [5]:
# Sanity check: both splits should be (num_reviews, maxlen) after padding.
(x_train.shape, x_val.shape)

((25000, 200), (25000, 200))

In [6]:
# Hyperparameters of the transformer classifier.
embed_dim = 32  # size of each token / position embedding vector
num_heads = 2   # number of attention heads
ff_dim = 32     # hidden layer size of the feedforward network inside the transformer block

# Input: one padded review, i.e. maxlen token ids.
inputs = Input(shape=(maxlen,))

# Token ids -> token embedding + position embedding.
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)

# One transformer encoder block over the embedded sequence.
transformer_block = TransformersBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average over the sequence axis, then a small MLP
# with dropout before and after the hidden layer.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)

# Two-way softmax over the sentiment classes.
outputs = Dense(2, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [7]:
# Compile and train the model.
# sparse_categorical_crossentropy takes integer class ids as targets, which
# is the label format returned by imdb.load_data (0 / 1).
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# validation_data is documented as a tuple (x_val, y_val). A list works only
# on Keras versions that coerce it, so pass the tuple explicitly.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
