## This Colab demonstrates a Transformer built on the attention mechanism for a text classification task, implemented with Keras.

Dataset used : IMDB dataset. 

**Attention mechanism:**

* Used especially in natural language processing problems such as text classification
* It is an improvement to the encoder-decoder setup
* It means focusing prominently on the words or lemmas that need attention while ignoring the rest of the text.

* It looks at an input sequence and decides at each of the steps which other parts of the sequence are important.

**Transformer**

* Attention is applied in transformers
* There is an encoder and a decoder setup. 
* The Encoder also writes down keywords that are important to the semantics of the sentence, and gives them to the Decoder in addition to the regular translation.


# Import libraries

In [None]:
# Import required libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Implement Multi Head Self Attention 

In [None]:
class MultiHead_SelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention layer.

    The input is projected to queries, keys and values by three Dense layers,
    split into `num_head` heads of width `embed_dim // num_head`, attended to
    head by head, merged back together and passed through a final Dense layer.

    Args:
        embed_dim: Width of the query/key/value projections and of the output.
        num_head: Number of attention heads; must divide `embed_dim` evenly.

    Raises:
        ValueError: If `embed_dim` is not divisible by `num_head`.
    """

    def __init__(self, embed_dim, num_head=8):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_head = num_head
        if embed_dim % num_head != 0:
            raise ValueError(f"embedding dimension = {embed_dim} should be divisible by number of head = {num_head}")
        # Per-head width after the projections are split across heads.
        self.projection_dim = embed_dim // num_head
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_head = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return `(output, weights)` of scaled dot-product attention.

        `weights` is the softmax over the last axis of `query @ key^T`
        divided by the square root of the key depth; `output` is
        `weights @ value`.
        """
        score = tf.matmul(query, key, transpose_b=True)
        # Build the scale factor in the score's own dtype. A hard-coded
        # float32 divisor cannot be combined with float16 scores, which is
        # what the Dense projections produce under a mixed-precision policy.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_head(self, x, batch_size):
        """Reshape `x` to (batch, num_head, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_head, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to `inputs` and return the combined heads.

        The attention weights are computed but not returned.
        """
        batch_size = tf.shape(inputs)[0]
        # Each projection is (batch_size, seq_len, embed_dim).
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        # Split into heads: (batch_size, num_head, seq_len, projection_dim).
        query = self.separate_head(query, batch_size)
        key = self.separate_head(key, batch_size)
        value = self.separate_head(value, batch_size)
        attention, weights = self.attention(query, key, value)
        # Move the head axis back next to the feature axis so the heads can
        # be concatenated into a single embed_dim-wide vector per position.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concatinate_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.combine_head(concatinate_attention)
        return output



In [None]:
class Transformer_Block(layers.Layer):
    """Single Transformer encoder block.

    Self-attention followed by a two-layer feed-forward network. Each
    sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Width of the block's input and output features.
        num_head: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_head, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHead_SelfAttention(embed_dim, num_head)
        self.ffn = keras.Sequential([layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout_1 = layers.Dropout(rate)
        self.dropout_2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: Tensor whose last dimension is `embed_dim`.
            training: Forwarded to both dropout layers. Defaults to None so
                the layer can be called without the flag; the dropout layers
                then decide for themselves whether they are active.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout_1(attn_output, training=training)
        # Residual connection around attention, then normalization.
        out1 = self.layer_norm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout_2(ffn_output, training=training)
        # Residual connection around the feed-forward network.
        return self.layer_norm2(out1 + ffn_output)



In [None]:

class TokenAndPosition_Embedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can hold.
        vocab_size: Number of distinct token ids the token table can hold.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the per-position vectors."""
        # Positions 0 .. seq_len-1, where seq_len is the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        pos_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        tok_vectors = self.token_emb(x)
        # The position vectors are broadcast across the batch dimension.
        return tok_vectors + pos_vectors



In [None]:
# Vocabulary cap passed to the dataset loader as `num_words`.
vocab_size = 20000
# Every review is padded to exactly this many tokens.
max_len = 200

# The second split returned by `load_data` is used as validation data here.
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=vocab_size
)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

# Pad both splits with the same settings so they share one sequence length.
x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
    for sequences in (x_train, x_val)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 Training sequences
25000 Validation sequences


In [None]:
# Define parameters
embed_dim = 32  # Width of each token/position embedding vector
num_head = 2  # Number of attention heads
feed_forward_dim = 32  # Hidden width of the block's feed-forward network

# Token ids -> token + position embeddings -> one Transformer block.
inputs = layers.Input(shape=(max_len,))
embedding_layer = TokenAndPosition_Embedding(max_len, vocab_size, embed_dim)
x = embedding_layer(inputs)
# Pass `feed_forward_dim`: the name `ff_dim` used here before is never
# defined in this notebook, so the cell raised NameError.
transformer_block = Transformer_Block(embed_dim, num_head, feed_forward_dim)
x = transformer_block(x)

# Average over the sequence axis, then classify with a small dense head.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax output.
output = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=output)

In [None]:
# Report the installed TensorFlow release
print(tf.__version__)

2.3.0


In [None]:
# Initialize Tensorboard for visualization
# IPython magics: (re)load the TensorBoard notebook extension, then open the
# dashboard on the `logs` directory.
# NOTE(review): the dashboard can only show what some cell writes under
# `logs` (e.g. a TensorBoard callback passed to `model.fit`) - confirm one does.
%reload_ext tensorboard
%tensorboard --logdir logs

In [None]:
# Integer labels with a softmax output, hence the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# This notebook opens TensorBoard on the `logs` directory, but training never
# wrote anything there. Log through a callback so the dashboard has data.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir="logs")

history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback],
)


Epoch 1/2
Epoch 2/2
