In [15]:
import numpy as np
class MultiHeadAttention:
    """Multi-head self-attention followed by a tied-weight feed-forward step.

    ``forward`` projects the input into queries, keys and values, runs scaled
    dot-product attention independently per head, merges the heads, adds the
    residual, layer-normalizes, and finally applies a ReLU feed-forward step
    that uses ``lin_trans`` on the way in and its transpose on the way out.
    """

    def __init__(self, num_layers, d, h):
        """Create randomly initialised projection weights and zero biases.

        Args:
            num_layers: Number of stacked layers requested by the caller. It
                is stored on the instance; ``forward`` does not use it.
            d: Size of the feature (last) axis of the input.
            h: Number of attention heads; must divide ``d`` evenly.

        Raises:
            ValueError: If ``d`` is not a multiple of ``h``.
        """
        if h <= 0 or d % h != 0:
            raise ValueError(f"d ({d}) must be a positive multiple of h ({h})")
        self.num_layers = num_layers
        self.d = d
        self.h = h
        # Floor division keeps d_k an int; reshape() rejects a float dimension.
        self.d_k = d // h
        (self.v_weights, self.k_weights,
         self.q_weights, self.lin_trans) = np.random.randn(4, d, d)
        self.bias_weights1 = np.zeros(d)
        self.bias_weights2 = np.zeros(d)

    @staticmethod
    def _softmax(scores):
        """Return a numerically stable softmax over the last axis."""
        # Subtracting the row maximum avoids overflow in exp().
        shifted = scores - np.max(scores, axis=-1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    @staticmethod
    def layer_norm(x, epsilon=1e-5):
        """Normalize ``x`` to zero mean and unit variance over the last axis."""
        mean = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        return (x - mean) / np.sqrt(variance + epsilon)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d)`` into ``(batch, h, seq, d_k)``."""
        x = x.reshape(batch_size, -1, self.h, self.d_k)
        return x.transpose(0, 2, 1, 3)

    def scaled_self_attention(self, queries, keys, values):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        The last two axes are treated as ``(seq, d_k)``; any leading axes
        (batch, head) are broadcast by ``np.matmul``.
        """
        # swapaxes on the last two axes works for any number of leading axes.
        attention_scores = (
            np.matmul(queries, np.swapaxes(keys, -1, -2)) / np.sqrt(self.d_k)
        )
        attention_weights = self._softmax(attention_scores)
        return np.matmul(attention_weights, values)

    def forward(self, x):
        """Run the layer on ``x`` of shape ``(batch, seq, d)``.

        Returns:
            An array with the same shape as ``x``.
        """
        self.x = x
        batch_size, seq_len, _ = x.shape

        values = self.split_heads(np.dot(x, self.v_weights), batch_size)
        keys = self.split_heads(np.dot(x, self.k_weights), batch_size)
        queries = self.split_heads(np.dot(x, self.q_weights), batch_size)

        attention_output = self.scaled_self_attention(queries, keys, values)
        # Undo split_heads: (batch, h, seq, d_k) -> (batch, seq, h, d_k)
        # -> (batch, seq, d), so the residual addition lines up with x.
        attention_output = attention_output.transpose(0, 2, 1, 3).reshape(
            batch_size, seq_len, self.d
        )

        normalized_output = self.layer_norm(x + attention_output)
        ffn_output = np.dot(normalized_output, self.lin_trans) + self.bias_weights1
        ffn_output = np.maximum(0, ffn_output)
        output = np.dot(ffn_output, self.lin_trans.T) + self.bias_weights2

        return output


In [17]:
    def backward(self, grad_output):
        """Compute parameter gradients from ``grad_output`` and apply an update.

        Each weight and bias is decremented in place by
        ``self.learning_rate`` times its gradient, and ``grad_values`` is
        returned.

        NOTE(review): this method reads the bare names ``ffn_output``,
        ``attention_output``, ``batch_size``, ``values``,
        ``attention_weights``, ``queries``, ``keys`` and ``x``. None of them
        is a parameter or assigned in this method, so unless they exist as
        globals the call raises ``NameError``. ``self.learning_rate`` is also
        not set by the ``__init__`` visible in this file.

        NOTE(review): in this view the method sits in its own notebook cell,
        outside any visible class body - confirm where it is attached.
        """
        # Bias gradient: reduce the incoming gradient over axis 0.
        grad_bias2 = np.sum(grad_output, axis=0)
        grad_ffn_output = np.dot(grad_output, self.lin_trans.T)
        # Zero the gradient wherever the activation was non-positive
        # (forward applies np.maximum(0, ...) to ffn_output).
        grad_ffn_output[ffn_output <= 0] = 0
        grad_lin_trans = np.dot(attention_output.T, grad_ffn_output)
        grad_bias1 = np.sum(grad_ffn_output, axis=0)

        grad_attention_output = np.dot(grad_ffn_output, self.lin_trans.T)
        # View the gradient with four axes: (batch_size, h, -1, d_k).
        grad_attention_output = grad_attention_output.reshape(batch_size, self.h, -1, self.d_k)

        # NOTE(review): `values` is transposed with three axes here, while
        # `queries` below is transposed with four - confirm the intended rank.
        grad_attention_weights = np.matmul(grad_attention_output, values.transpose(0, 2, 1))
        grad_values = np.matmul(grad_attention_output.transpose(0, 2, 1, 3), attention_weights)
        grad_keys = np.matmul(queries.transpose(0, 2, 1, 3).transpose(1, 2, 0, 3), grad_attention_output)

        # Collapse the per-head axes back to a trailing axis of size d.
        grad_values = grad_values.reshape(batch_size, -1, self.d)
        grad_keys = grad_keys.reshape(batch_size, -1, self.d)
        grad_queries = np.matmul(grad_attention_weights, keys)

        grad_v_weights = np.dot(x.T, grad_values)
        grad_k_weights = np.dot(x.T, grad_keys)
        grad_q_weights = np.dot(x.T, grad_queries)

        # In-place gradient-descent step on every parameter.
        self.v_weights -= self.learning_rate * grad_v_weights
        self.k_weights -= self.learning_rate * grad_k_weights
        self.q_weights -= self.learning_rate * grad_q_weights
        self.lin_trans -= self.learning_rate * grad_lin_trans
        self.bias_weights1 -= self.learning_rate * grad_bias1
        self.bias_weights2 -= self.learning_rate * grad_bias2

        return grad_values

In [19]:
# Hyperparameters: N = 6 repeated (cloned) layers, feature size 512, 8 heads.
num_layers, d, h = 6, 512, 8

# Bare expression: as the last statement of a notebook cell it displays the
# repr of the freshly built instance.
MultiHeadAttention(num_layers, d, h)

<__main__.MultiHeadAttention at 0x7a9d8119f160>

In [20]:
class LayerNormalization:
    """Layer normalization over the last axis with learnable scale and shift.

    ``forward`` caches the values ``backward`` needs. ``backward`` returns the
    gradient with respect to the input and applies one gradient-descent step
    to ``gamma`` and ``beta``.
    """

    def __init__(self, epsilon=1e-5, dim=None, learning_rate=0.01):
        """Initialise ``gamma`` to ones and ``beta`` to zeros.

        Args:
            epsilon: Constant added to the variance before the square root.
            dim: Size of the normalized (last) axis. When omitted, the
                module-level ``d`` is used, which is what this constructor
                read before the parameter existed.
            learning_rate: Step size used by ``backward`` for the update.
        """
        if dim is None:
            # Backward-compatible fallback to the notebook-level global.
            dim = d
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.gamma = np.ones(dim)  # Scaling parameter
        self.beta = np.zeros(dim)   # Shifting parameter
        # Filled in by forward(); backward() refuses to run without them.
        self.x = None
        self.variance = None
        self.normalized_x = None

    def forward(self, x):
        """Normalize ``x`` over its last axis, then scale and shift.

        Returns:
            ``gamma * (x - mean) / sqrt(var + epsilon) + beta``.
        """
        mean = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        normalized_x = (x - mean) / np.sqrt(variance + self.epsilon)
        # Cache for backward().
        self.x = x
        self.variance = variance
        self.normalized_x = normalized_x
        return self.gamma * normalized_x + self.beta

    def backward(self, grad_output):
        """Back-propagate ``grad_output`` and update ``gamma`` and ``beta``.

        Args:
            grad_output: Gradient of the loss with respect to the output of
                the most recent ``forward`` call (same shape as that output).

        Returns:
            Gradient of the loss with respect to the input of ``forward``.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if self.normalized_x is None:
            raise RuntimeError("backward() called before forward()")

        x_hat = self.normalized_x
        inv_std = 1.0 / np.sqrt(self.variance + self.epsilon)
        grad_x_hat = grad_output * self.gamma

        # Closed form of the layer-norm Jacobian applied to grad_x_hat: the
        # two subtracted terms account for the mean and the variance both
        # depending on every element of the row.
        grad_input = inv_std * (
            grad_x_hat
            - np.mean(grad_x_hat, axis=-1, keepdims=True)
            - x_hat * np.mean(grad_x_hat * x_hat, axis=-1, keepdims=True)
        )

        # gamma and beta are shared by every row, so their gradients are
        # summed over all axes except the feature axis.
        batch_axes = tuple(range(np.ndim(grad_output) - 1))
        grad_gamma = np.sum(grad_output * x_hat, axis=batch_axes)
        grad_beta = np.sum(grad_output, axis=batch_axes)

        self.gamma -= self.learning_rate * grad_gamma
        self.beta -= self.learning_rate * grad_beta
        return grad_input

class MultiHeadAttention:
    def __init__(self, num_layers, d, h):
        self.d = d
        self.h = h
        self.d_k = d // h
        self.v_weights, self.k_weights, self.q_weights, self.lin_trans = (np.random.randn(4, d, d))
        self.bias_weights1, self.bias_weights2 = (np.zeros(d), np.zeros(d))
        self.layers = [LayerNormalization() for _ in range(num_layers)]