In [1]:
! pip install tensorflow numpy

Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting numpy
  Downloading numpy-2.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from te

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer

<h2>Creating a Custom SelfAttention Layer</h2>

In [4]:
class SelfAttention(Layer):
    """Single-head scaled dot-product self-attention.

    The input is projected by three independent ``Dense`` layers into
    queries (Q), keys (K) and values (V), each with ``dim`` units, and the
    layer returns ``softmax(Q @ K^T / sqrt(dim)) @ V``.

    Args:
        dim: Number of units of each Q/K/V projection. Its square root is
            also the divisor applied to the raw attention scores.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor
            (for example ``name``), which also lets the default
            ``from_config`` rebuild the layer from ``get_config()``.
    """

    def __init__(self, dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = dim
        self.query = tf.keras.layers.Dense(dim)
        self.key = tf.keras.layers.Dense(dim)
        self.value = tf.keras.layers.Dense(dim)
        # Normalise the scores over the last axis (one distribution per query row).
        self.softmax = tf.keras.layers.Softmax(axis=-1)

    def call(self, inputs):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Input tensor; assumed to be laid out as
                (batch, positions, features) — TODO confirm against callers.

        Returns:
            The attention-weighted combination of the value projections;
            its last axis has ``dim`` entries.
        """
        Q = self.query(inputs)
        K = self.key(inputs)
        V = self.value(inputs)

        # Pairwise query/key scores: Q @ K^T over the last two axes.
        qk = tf.matmul(Q, K, transpose_b=True)
        # Build the scale in the scores' own dtype so the division below never
        # mixes dtypes (a hard-coded float32 scale breaks non-float32 scores).
        sqrt = tf.math.sqrt(tf.cast(self.embed_dim, qk.dtype))
        inner = qk / sqrt

        attention_weight = self.softmax(inner)
        result = tf.matmul(attention_weight, V)

        return result

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update({"dim": self.embed_dim})
        return config

In [7]:
# Smoke test: push a random tensor of shape (2, 5, 64) through the custom layer.
x = tf.random.normal(shape=(2, 5, 64))
attention = SelfAttention(64)
attention_val = attention(x)
# Displayed as the cell result.
attention_val.shape

TensorShape([2, 5, 64])

<h2>Using the Predefined MultiHeadAttention Layer</h2>

In [10]:
from tensorflow.keras.layers import MultiHeadAttention, Input
from tensorflow.keras.models import Model

In [11]:
# Symbolic model input; `shape` describes a single sample (no batch axis).
input_tensor = Input(shape=(5, 64))

In [14]:
# Self-attention: the same tensor is supplied as both query and value.
multihead_output = MultiHeadAttention(
    num_heads=4,
    key_dim=16,
)(input_tensor, input_tensor)

In [15]:
# Wrap the symbolic input/output pair in a functional Keras model.
model = Model(
    inputs=input_tensor,
    outputs=multihead_output,
)

In [16]:
# Print the model's layer-by-layer summary.
model.summary()