<a href="https://colab.research.google.com/github/Ricardo711/LLM/blob/master/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Basic (non-parameterized) self-attention: compute the pairwise weights w_ij,
# normalize them with softmax and build context-aware embeddings.
import torch
import torch.nn.functional as F

# Token ids for the example sentence, one id per word.
sentence = torch.tensor(
    [
        0,  # can
        7,  # you
        1,  # help
        2,  # me
        5,  # to
        6,  # translate
        4,  # this
        3,  # sentence
    ]
)

# Encode the sentence into real-valued vectors through an embedding layer.
torch.manual_seed(123)
embed = torch.nn.Embedding(num_embeddings=10, embedding_dim=16)
# detach() returns a tensor that is cut off from autograd (requires_grad=False).
embedded_sentence = embed(sentence).detach()
print(embedded_sentence.shape)

# w_ij is the dot product between the i-th and j-th word embeddings.
# The buffer is sized from the sentence itself (not a literal 8 x 8) so the
# cell keeps working if the example sentence changes length.
seq_len = embedded_sentence.shape[0]
omega = torch.empty(seq_len, seq_len)
for i, x_i in enumerate(embedded_sentence):
    for j, x_j in enumerate(embedded_sentence):
        omega[i, j] = torch.dot(x_i, x_j)

##################################################################
# Step 1: compute dot products
##################################################################
# Two Python loops are inefficient; a single matrix product X @ X^T gives the
# same table of pairwise dot products.
omega_mat = embedded_sentence.matmul(embedded_sentence.T)
print(omega_mat)
# omega_mat[0, 5] = -1.6717  # similarity between "can" and "translate"
# omega_mat[2, 1] =  9.0642  # similarity between "help" and "you"

# Check that the loop version and the matmul version agree. The result is
# bound to a name (instead of being a discarded expression) so it can be
# inspected; nothing is printed, matching the original cell output.
omegas_match = torch.allclose(omega, omega_mat)

# omega holds the similarity-based weights between the i-th input and every
# input in the sequence. The attention weights a_ij are obtained by
# normalizing each row of w_ij with softmax.
##################################################################
# Step 2: normalize using softmax
##################################################################
attention_weights = F.softmax(omega, dim=1)
print(attention_weights.shape)
print(attention_weights)
# Row 0: the 0th token mostly attends to itself (0.9927).
# Very little attention is given to:
# - 1st token: 0.000324
# - 2nd token: 0.00665
# - 3rd token: 0.0000147
# - 4th token: 0.0000921
# - 5th token: 0.0000108
# - 6th token: 0.000159
# - 7th token: 0.0000504

# These attention weights indicate how relevant each word is to the i-th word.
# softmax was taken over dim=1, so every ROW (not column) must sum to 1.
row_sums = attention_weights.sum(dim=1)

##################################################################
# Step 3: compute the context-aware embedding vectors
##################################################################
# Each output row is the attention-weighted sum of all input embeddings.
context_vectors = torch.matmul(attention_weights, embedded_sentence)
print(context_vectors.shape)
print(context_vectors)

torch.Size([8, 16])
tensor([[ 9.7601,  1.7326,  4.7543, -1.3587,  0.4752, -1.6717,  1.0227, -0.1286],
        [ 1.7326, 16.0787,  9.0642, -0.3370,  1.1368,  1.1972,  1.6485, -1.2789],
        [ 4.7543,  9.0642, 22.6615, -0.8519,  7.7799,  2.7483, -0.6832,  1.6236],
        [-1.3587, -0.3370, -0.8519, 13.9473, -1.4198, 10.9659, -0.5887,  2.3869],
        [ 0.4752,  1.1368,  7.7799, -1.4198, 13.7511, -6.8568, -2.5114, -3.3468],
        [-1.6717,  1.1972,  2.7483, 10.9659, -6.8568, 24.6738, -3.8294,  4.9581],
        [ 1.0227,  1.6485, -0.6832, -0.5887, -2.5114, -3.8294, 15.8691,  2.0269],
        [-0.1286, -1.2789,  1.6236,  2.3869, -3.3468,  4.9581,  2.0269, 18.7382]])
torch.Size([8, 8])
tensor([[9.9270e-01, 3.2398e-04, 6.6502e-03, 1.4723e-05, 9.2135e-05, 1.0766e-05,
         1.5929e-04, 5.0374e-05],
        [5.8773e-07, 9.9910e-01, 8.9788e-04, 7.4187e-08, 3.2391e-07, 3.4407e-07,
         5.4033e-07, 2.8926e-08],
        [1.6712e-08, 1.2438e-06, 1.0000e+00, 6.1412e-11, 3.4437e-07, 2.248

# Parameterizing the self-attention mechanism: scaled dot-product attention

In [2]:
# Scaled dot-product attention with three projection matrices U_q, U_k, U_v:
#   q_i = U_q x_i,  k_i = U_k x_i,  v_i = U_v x_i
# U_q and U_k have shape (d_k, d) and U_v has shape (d_v, d), where d is the
# embedding dimension. For simplicity d_k = d_v = d here.
torch.manual_seed(123)
d = embedded_sentence.shape[1]
# torch.rand samples from the uniform distribution on [0, 1).
U_query = torch.rand(d, d)
U_key = torch.rand(d, d)
U_value = torch.rand(d, d)

# Work with the second token of the sentence ("you", index 1).
x_2 = embedded_sentence[1]
query_2 = U_query @ x_2
key_2 = U_key @ x_2
value_2 = U_value @ x_2

# Keys and values are also needed for every input in the sequence:
# project all embeddings at once, then transpose back to one row per token.
keys = (U_key @ embedded_sentence.T).T
values = (U_value @ embedded_sentence.T).T

# Sanity check: the row for token index 1 must equal the single-token result.
print(torch.allclose(keys[1], key_2))
print(torch.allclose(values[1], value_2))

################################################
# Step 1: compute w_2j = q_2^T k_j for every j
################################################
omega_2 = query_2 @ keys.T
print(omega_2)

################################################
# Step 2: attention weights via softmax, scaled by 1 / sqrt(d)
################################################
attention_weights_2 = F.softmax(omega_2 / d ** 0.5, dim=0)
print(attention_weights_2)

################################################
# Step 3: weighted average of the value sequence
################################################
context_vector_2 = attention_weights_2 @ values
print(context_vector_2)

True
True
tensor([-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
        -32.9392])
tensor([2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
        8.8897e-07, 3.1935e-10])
tensor([-1.2226, -3.4387, -4.3928, -5.2125, -1.1249, -3.3041, -1.4316, -3.2765,
        -2.5114, -2.6105, -1.5793, -2.8433, -2.4142, -0.3998, -1.9917, -3.3499])


# Original Transformer


In [19]:
# Multi-head attention as used in the original Transformer encoder.
# The encoder takes inputs x_1..x_n and maps them to an output sequence
# z_1..z_n. One triple (U_q, U_k, U_v) is called an attention head; with h
# heads there are h such triples: U_q1, U_k1, U_v1, ..., U_qh, U_kh, U_vh.
# U_qj and U_kj have shape (d_k, d) and U_vj has shape (d_v, d), where d is
# the embedding size. For simplicity d_k = d_v = d.
import math

torch.manual_seed(123)
d = embedded_sentence.shape[1]
# Single-head query matrix, shown for comparison with the stacked tensors
# below. It is not used afterwards, but it is kept on purpose: drawing it
# advances the random generator, so removing it would change every tensor
# sampled after it.
one_Q_query = torch.rand(d, d)
# Suppose we have 8 attention heads.
h = 8
multihead_U_query = torch.rand(h, d, d)
multihead_U_key = torch.rand(h, d, d)
multihead_U_value = torch.rand(h, d, d)
print(multihead_U_key.shape)

##################################################
# Step 1: compute the per-head projections, e.g. q_ij = U_qj x_i
##################################################
# Repeat the (transposed) inputs once per head so that a single batched
# matrix product can serve all heads. The repeat count follows `h` instead
# of a second hard-coded 8.
stacked_inputs = embedded_sentence.T.repeat(h, 1, 1)
print(stacked_inputs.shape)
# Batched multiplication via torch.bmm() computes the keys of all heads.
multihead_keys = torch.bmm(multihead_U_key, stacked_inputs)
print(multihead_keys.shape)  # [no. attention heads, embedding dimension, no. words]
# Swap the 2nd and 3rd dimensions -> [heads, words, embedding dimension].
multihead_keys = multihead_keys.permute(0, 2, 1)
print(multihead_keys.shape)

# Same for the values (torch.bmm for consistency with keys and queries; for
# two 3-D tensors with equal batch size it is the same batched product).
multihead_values = torch.bmm(multihead_U_value, stacked_inputs)
multihead_values = multihead_values.permute(0, 2, 1)
print(multihead_values.shape)

######################################################
# Calculate the context vectors
######################################################
# Step 1: queries for all heads.
multihead_queries = torch.bmm(multihead_U_query, stacked_inputs)  # shape: [h, d, T]
multihead_queries = multihead_queries.permute(0, 2, 1)
print(multihead_queries.shape)

# Step 2: compute the dot products Q K^T.
# multihead_queries: [h, T, d]
# multihead_keys:    [h, T, d]
# Q @ K^T => [h, T, T]
scores = torch.bmm(multihead_queries, multihead_keys.transpose(1, 2))

# Step 3: scale the scores.
dk = d  # assuming dk = d
scaled_scores = scores / math.sqrt(dk)

# Step 4: softmax over the last dimension gives the attention weights.
attention_weights = torch.softmax(scaled_scores, dim=-1)

# Step 5: multiply with the values to get the context vectors.
# multihead_values: [h, T, d]
context_vectors = torch.bmm(attention_weights, multihead_values)  # shape: [h, T, d]
print(context_vectors.shape)
# Context vector of the first token (index 0) from the first head (head 0).
print(context_vectors[0][0])
# Interpreting the dimensions of context_vectors:
#   first dimension  (8):  number of attention heads
#   second dimension (8):  number of tokens (words) in the input sentence
#   third dimension  (16): context vector size (same as the embedding dim d)
#
# context_vectors[head][token] is a vector of size d describing what that
# attention head considers important about the token at that position,
# after looking at all other tokens.

##################################################################
# Linear projection
##################################################################
# Layer sizes are derived from the tensors instead of the literal 8*8*16 and
# 16, so they stay correct if h, d or the sentence length change.
# NOTE(review): flatten() merges heads AND tokens, so this yields a single
# d-dimensional vector for the whole sentence. The original Transformer
# concatenates the h heads per token and projects h*d -> d, giving one output
# vector per token. Kept as-is here to preserve the existing output shape.
linear = torch.nn.Linear(context_vectors.numel(), d)
context_vector = linear(context_vectors.flatten())
print(context_vector.shape)

torch.Size([8, 16, 16])
torch.Size([8, 16, 8])
torch.Size([8, 16, 8])
torch.Size([8, 8, 16])
torch.Size([8, 8, 16])
torch.Size([8, 8, 16])
torch.Size([8, 8, 16])
tensor([ 4.5022,  3.0754,  3.6300,  0.7366,  0.7970,  2.5551,  4.0832,  1.2459,
         0.8504,  1.7613,  0.7076,  2.1336,  0.5397, -0.0704,  1.2788,  2.4048])
torch.Size([16])
