SELF-ATTENTION MECHANISM (WITHOUT TRAINABLE WEIGHTS)

In [1]:
import torch
import torch.nn.functional as f
import math
# Toy embeddings: one 3-dimensional row per token of the example sentence.
token_embeddings = [
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
]
inputs = torch.tensor(token_embeddings)

In [None]:
# Attention scores: dot product of every token embedding with every other one.
attention_weights = torch.matmul(inputs, inputs.T)
# Normalise each row into a probability distribution. The axis is passed
# explicitly because calling softmax without `dim` is deprecated and warns.
attention_weights = f.softmax(attention_weights, dim=-1)
# Context vectors: attention-weighted sum of the input embeddings.
context_vectors = torch.matmul(attention_weights, inputs)
context_vectors

SELF ATTENTION MECHANISM WITH TRAINABLE WEIGHTS

In [None]:
dim_of_weight_matrix = inputs.shape[-1]

# Initialize the trainable projection matrices for queries, values and keys.
# The creation order (query, value, key) is kept so the seeded random values
# stay the same as before.
torch.manual_seed(123)
weights_query = torch.nn.Parameter(torch.randn(dim_of_weight_matrix, dim_of_weight_matrix))
# nn.Linear could be used instead of nn.Parameter: it can optionally add a bias
# and comes with a more sophisticated weight initialisation scheme.
weights_value = torch.nn.Parameter(torch.randn(dim_of_weight_matrix, dim_of_weight_matrix))
weights_key = torch.nn.Parameter(torch.randn(dim_of_weight_matrix, dim_of_weight_matrix))

# Project the inputs into queries, keys and values.
query = torch.matmul(inputs, weights_query)
key = torch.matmul(inputs, weights_key)
value = torch.matmul(inputs, weights_value)

# Calculate the attention scores and scale them by sqrt(d_k).
attention_scores = torch.matmul(query, key.T)
attention_scores1 = attention_scores / key.shape[-1] ** 0.5
# Normalise over the key axis; `dim` is explicit because the implicit form is deprecated.
attention_scores1 = f.softmax(attention_scores1, dim=-1)

# Calculate the context vectors as the attention-weighted sum of the VALUES
# (the previous version multiplied by the keys by mistake).
context_vectors = torch.matmul(attention_scores1, value)
context_vectors

MASKED SELF ATTENTION WITH DROPOUT

In [None]:
dim_of_weight_matrix = inputs.shape[-1]
print(dim_of_weight_matrix)
# NOTE(review): torch.seed() re-seeds with a non-deterministic value, so this
# cell is not reproducible; use torch.manual_seed(<int>) if that is wanted.
torch.seed()
weights_query = torch.nn.Linear(dim_of_weight_matrix, dim_of_weight_matrix, bias=False)
weights_value = torch.nn.Linear(dim_of_weight_matrix, dim_of_weight_matrix, bias=False)
weights_key = torch.nn.Linear(dim_of_weight_matrix, dim_of_weight_matrix, bias=False)
query = weights_query(inputs)
key = weights_key(inputs)
value = weights_value(inputs)
attention_scores = torch.matmul(query, key.T)

# Causal mask: True strictly above the diagonal, i.e. for "future" positions.
num_tokens = attention_scores.shape[0]
mask = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1) == 1
# Masked positions get -inf so that softmax assigns them zero weight.
result = attention_scores.masked_fill(mask, -torch.inf)
attn_weights = torch.softmax(result / key.shape[-1] ** 0.5, dim=-1)

# Randomly drop attention weights (the rest are rescaled by 1 / (1 - p)).
dropout = torch.nn.Dropout(0.5)
attn_weights = dropout(attn_weights)

# Context vectors are the attention-weighted sum of the VALUES
# (the previous version multiplied by the keys by mistake).
context_vectors = torch.matmul(attn_weights, value)
print(context_vectors)

MULTI-HEAD ATTENTION

In [None]:
# Batch of two sequences, each with three tokens of six features.
inputs = torch.tensor([
    [
        [0.43, 0.15, 0.89, 0.55, 0.87, 0.66],  # Row 1
        [0.57, 0.85, 0.64, 0.22, 0.58, 0.33],  # Row 2
        [0.77, 0.25, 0.10, 0.05, 0.80, 0.55],  # Row 3
    ],
    [
        [0.10, 0.55, 0.87, 0.66, 0.43, 0.15],  # Row 4
        [0.22, 0.58, 0.33, 0.05, 0.85, 0.85],  # Row 5
        [0.10, 0.55, 0.8, 0.05, 0.85, 0.85],   # Row 6
    ],
])

print(inputs.shape)

batch_size, num_tokens, dim_of_weight_matrix = inputs.shape
num_heads = 3
head_dim = dim_of_weight_matrix // num_heads

# Projection layers, created in the order query, value, key.
weights_query = torch.nn.Linear(in_features=dim_of_weight_matrix, out_features=dim_of_weight_matrix, bias=False)
weights_value = torch.nn.Linear(in_features=dim_of_weight_matrix, out_features=dim_of_weight_matrix, bias=False)
weights_key = torch.nn.Linear(in_features=dim_of_weight_matrix, out_features=dim_of_weight_matrix, bias=False)

# Project, split the feature axis into (heads, head_dim), then move the head
# axis in front of the token axis: (batch, heads, tokens, head_dim).
split_shape = (batch_size, num_tokens, num_heads, head_dim)
query = weights_query(inputs).reshape(split_shape).transpose(1, 2)
key = weights_key(inputs).reshape(split_shape).transpose(1, 2)
value = weights_value(inputs).reshape(split_shape).transpose(1, 2)

# Per-head scores with a causal mask on the last two axes.
multi_head_attention_scores = query @ key.transpose(2, 3)
mask = torch.ones(multi_head_attention_scores.shape).triu(diagonal=1) == 1
result = multi_head_attention_scores.masked_fill(mask, -torch.inf)
multi_head_attn_weights = torch.softmax(result / key.shape[-1] ** 0.5, dim=-1)
dropout = torch.nn.Dropout(0.5)
multi_head_attn_weights = dropout(multi_head_attn_weights)

print(multi_head_attn_weights.shape)
# Weighted sum of values, then bring the token axis back in front of the heads.
multi_head_context_vectors = (multi_head_attn_weights @ value).transpose(1, 2)
print(multi_head_context_vectors.shape)
# Merge the heads back into a single feature axis.
multi_head_context_vectors = multi_head_context_vectors.reshape(batch_size, num_tokens, dim_of_weight_matrix)
print(multi_head_context_vectors.shape)


IMPLEMENTING A MULTI HEAD ATTENTION CLASS

In [2]:
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention.

    Args:
        dim_model: Size of the last (feature) axis of the input; also the
            output size. Must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        bias: Whether the query/key/value projections use a bias term.
        model_dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``dim_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, dim_model, num_heads, bias, model_dropout):
        super().__init__()
        if dim_model % num_heads != 0:
            raise ValueError(
                f"dim_model ({dim_model}) must be divisible by num_heads ({num_heads})"
            )
        self.dim_model = dim_model
        self.bias = bias
        self.num_heads = num_heads
        self.head_dim = dim_model // num_heads
        # Creation order (query, value, key) is kept so seeded initialisation is unchanged.
        self.weights_query = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.weights_value = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.weights_key = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.dropout = torch.nn.Dropout(model_dropout)

    def forward(self, inputs):
        """Return context vectors with the same shape as ``inputs``.

        ``inputs`` is indexed as (batch, tokens, dim_model) by the reshapes below.
        """
        batch_size, num_tokens = inputs.shape[0], inputs.shape[1]
        split_shape = (batch_size, num_tokens, self.num_heads, self.head_dim)

        # Project, split features into heads, and move heads before tokens:
        # (batch, heads, tokens, head_dim).
        query = self.weights_query(inputs).reshape(split_shape).transpose(1, 2)
        key = self.weights_key(inputs).reshape(split_shape).transpose(1, 2)
        value = self.weights_value(inputs).reshape(split_shape).transpose(1, 2)

        scores = torch.matmul(query, key.transpose(2, 3))

        # Causal mask (True above the diagonal), built on the same device as
        # the scores and broadcast over the batch and head axes.
        mask = torch.ones(num_tokens, num_tokens, device=scores.device).triu(diagonal=1).bool()
        scores = scores.masked_fill(mask, float("-inf"))

        attn_weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        # Use the registered dropout module so the configured rate is applied
        # and dropout is disabled in eval() mode.
        attn_weights = self.dropout(attn_weights)

        # (batch, heads, tokens, head_dim) -> (batch, tokens, heads, head_dim)
        context = torch.matmul(attn_weights, value).transpose(1, 2)
        # Merge the heads back into a single feature axis.
        return context.reshape(batch_size, num_tokens, self.dim_model)

In [None]:
# Batch of two sequences, each with three tokens of six features.
inputs = torch.tensor(
   [[[0.43, 0.15, 0.89, 0.55, 0.87, 0.66],  # Row 1
     [0.57, 0.85, 0.64, 0.22, 0.58, 0.33],  # Row 2
     [0.77, 0.25, 0.10, 0.05, 0.80, 0.55]],  # Row 3
    [[0.10, 0.55, 0.87, 0.66, 0.43, 0.15],  # Row 4
     [0.22, 0.58, 0.33, 0.05, 0.85, 0.85],  # Row 5
     [0.10, 0.55, 0.8, 0.05, 0.85, 0.85]]]  # Row 6
)
# Three heads, no projection bias, dropout rate 0.1.
multi_head = MultiHeadAttention(inputs.shape[-1], 3, False, 0.1)
# Call the module itself rather than .forward() so registered hooks are run.
result = multi_head(inputs)
result.shape

LAYER NORMALISATION, GELU ACTIVATION, AND FEED-FORWARD NEURAL NETWORK

In [None]:
# Manual layer normalisation over the last (feature) axis of a small example.
tensor = torch.tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
mean = tensor.mean(dim=-1, keepdim=True)
# Layer normalisation uses the biased (population) variance, i.e. division by
# N rather than N - 1, so Bessel's correction is switched off.
var = tensor.var(dim=-1, unbiased=False, keepdim=True)

# The small epsilon guards against division by zero.
normalized_tensor = (tensor - mean) / torch.sqrt(var + 1e-5)
print(mean, var)
print(normalized_tensor)


In [None]:
class layer_normalisation(torch.nn.Module):
    """Layer normalisation over the last axis with a learnable scale and shift.

    Args:
        dim_model: Size of the last axis of the tensors to normalise.
    """

    def __init__(self, dim_model):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(dim_model))
        self.shift = torch.nn.Parameter(torch.zeros(dim_model))
        # Added to the variance to avoid division by zero.
        self.eps = 1e-5

    def forward(self, inputs):
        """Normalise ``inputs`` along the last axis, then apply scale and shift."""
        mean = inputs.mean(dim=-1, keepdim=True)
        # Biased (population) variance, as in torch.nn.LayerNorm; the default
        # Bessel-corrected variance would leave the output variance at (N-1)/N.
        var = inputs.var(dim=-1, unbiased=False, keepdim=True)
        normalized_inputs = (inputs - mean) / torch.sqrt(var + self.eps)
        return self.scale * normalized_inputs + self.shift

: 

In [None]:
# Run the layer-normalisation module on a small example and inspect the
# per-row mean and biased variance of its output.
tensor = torch.tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
layer_norm = layer_normalisation(6)
# Call the module itself rather than .forward() so registered hooks are run.
out_ln = layer_norm(tensor)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
print(mean, var)


In [None]:
class GeLU(torch.nn.Module):
    """GELU activation using the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, inputs):
        """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
        # math.sqrt is used for the scalar constant: torch.sqrt only accepts
        # tensors and raises a TypeError for a Python float.
        coefficient = math.sqrt(2 / math.pi)
        inner = coefficient * (inputs + 0.044715 * torch.pow(inputs, 3))
        return 0.5 * inputs * (1 + torch.tanh(inner))

In [None]:
class feed_forward(torch.nn.Module):
    """Two-layer MLP: expand to 4x the model width, apply GeLU, project back."""

    def __init__(self, dim_model):
        super().__init__()
        hidden_dim = 4 * dim_model
        # Sub-modules are built in the same order as they run.
        expand = torch.nn.Linear(dim_model, hidden_dim)
        activation = GeLU()
        project = torch.nn.Linear(hidden_dim, dim_model)
        self.layer = torch.nn.Sequential(expand, activation, project)

    def forward(self, inputs):
        """Run ``inputs`` through the expand -> GeLU -> project stack."""
        outputs = self.layer(inputs)
        return outputs

Coding Transformer Block

In [None]:
class Transformer(torch.nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        dim_model: Feature size of the block's input and output.
        num_heads: Number of attention heads.
        model_dropout: Dropout probability used inside attention and on both
            sub-layer outputs.
        qkv_bias: Whether the query/key/value projections use a bias term.
            Defaults to ``False``, which matches the previous hard-coded value.
    """

    def __init__(self, dim_model, num_heads, model_dropout, qkv_bias=False):
        super().__init__()
        self.layer_norm = layer_normalisation(dim_model)
        self.layer_norm2 = layer_normalisation(dim_model)
        self.attention = MultiHeadAttention(dim_model, num_heads, qkv_bias, model_dropout)
        self.feed_forward = feed_forward(dim_model)
        self.dropout = torch.nn.Dropout(model_dropout)

    def forward(self, inputs):
        """Return the block output; same shape as ``inputs``."""
        # Attention sub-layer: norm -> attention -> dropout -> residual add.
        attention_output = self.attention(self.layer_norm(inputs))
        attention_output = self.dropout(attention_output)
        # Out-of-place add: avoids mutating the sub-layer's output tensor,
        # which `+=` would do in place.
        hidden = inputs + attention_output

        # Feed-forward sub-layer: norm -> MLP -> dropout -> residual add.
        feed_forward_output = self.feed_forward(self.layer_norm2(hidden))
        feed_forward_output = self.dropout(feed_forward_output)
        return hidden + feed_forward_output


GPT-2 ARCHITECTURE

In [None]:
class gpt2_architecture(torch.nn.Module):
    """GPT-2 style model: embeddings, a stack of transformer blocks, output head.

    Args:
        config: Mapping that provides the keys ``vocab_size``,
            ``context_length``, ``emb_dim``, ``n_heads``, ``n_layers`` and
            ``drop_rate``.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.positional_embedding = torch.nn.Embedding(config['context_length'], config["emb_dim"])
        # NOTE: the attribute name `droput` (sic) is kept unchanged so existing
        # references to it keep working.
        self.droput = torch.nn.Dropout(config['drop_rate'])
        self.transformer = torch.nn.Sequential(
            *[
                Transformer(config['emb_dim'], config['n_heads'], config['drop_rate'])
                for _ in range(config['n_layers'])
            ]
        )
        self.final_layer_norm = layer_normalisation(config['emb_dim'])
        self.output_layer = torch.nn.Linear(config["emb_dim"], config["vocab_size"])

    def forward(self, inputs):
        """Map integer token ids to one ``vocab_size`` vector of logits per position.

        The last axis of ``inputs`` is treated as the token-position axis.
        """
        x = self.token_embedding(inputs)
        # Positional embeddings are looked up by position index 0..seq_len-1,
        # not by the (float) token embeddings; an embedding lookup requires
        # integer indices.
        positions = torch.arange(inputs.shape[-1], device=inputs.device)
        x = x + self.positional_embedding(positions)
        x = self.droput(x)
        x = self.transformer(x)
        x = self.final_layer_norm(x)
        x = self.output_layer(x)
        return x


In [None]:

# Hyper-parameters for the 124M-parameter GPT-2 configuration.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=256,   # Shortened context length (orig: 1024)
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-key-value bias
)

# Seed before building the model so the weight initialisation is repeatable.
torch.manual_seed(123)
model = gpt2_architecture(GPT_CONFIG_124M)

# One sequence of four token ids.
inputs = torch.tensor([[6109, 3626, 6100, 345]])
out = model(inputs)

In [None]:
# One sequence of four token ids, shown for inspection.
token_ids = [[6109, 3626, 6100, 345]]
inputs = torch.tensor(token_ids)
print(inputs)
