<a href="https://colab.research.google.com/github/OptimoCX/BootCampIA/blob/main/SelfAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Self Attention

In [None]:
import numpy as np

# Example input embeddings: 10 elements, each a 16-dimensional vector.
# A seeded generator keeps the walkthrough reproducible after a kernel restart.
rng = np.random.default_rng(42)
X = rng.random((10, 16))

# Initialize weight matrices for queries, keys, and values
# (16 x 16 chosen for example purposes; entries are uniform in [0, 1)).
W_Q = rng.random((16, 16))
W_K = rng.random((16, 16))
W_V = rng.random((16, 16))

# Compute queries, keys, and values by projecting the embeddings
Q = np.dot(X, W_Q)
K = np.dot(X, W_K)
V = np.dot(X, W_V)

In [None]:
# Raw attention scores: each row of Q dotted with each row of K
dot_product = Q.dot(K.T)

In [None]:
# Key dimensionality: the size of K's last axis (16 in this example)
d_k = K.shape[-1]

# Scale the raw scores by sqrt(d_k), i.e. divide by 4 in this example
scaled_dot_product = np.divide(dot_product, np.sqrt(d_k))

In [None]:
def softmax(z):
    """Return the softmax of ``z`` computed along its last axis.

    Args:
        z: Array-like of scores. A 1-D input is treated as a single
            distribution; for higher-rank input every slice along the last
            axis (e.g. each row of a 2-D score matrix) is normalized
            independently.

    Returns:
        numpy.ndarray with the same shape as ``z`` whose entries along the
        last axis are non-negative and sum to 1.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalize; np.max would raise on an empty reduction.
        return np.exp(z)
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but keeps np.exp from overflowing on large scores.
    exp_scores = np.exp(z - np.max(z, axis=-1, keepdims=True))
    # Normalize along the last axis only, so each row sums to 1 instead of
    # the whole matrix summing to 1.
    probabilities = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    return probabilities

# Example usage
vector_array = [2.0, 1.0, 0.1]
# Compute first, then report, so the result is also available for inspection
example_probabilities = softmax(vector_array)
print("Softmax probabilities:", example_probabilities)

Softmax probabilities: [0.65900114 0.24243297 0.09856589]


In [None]:
# Convert the scaled scores into attention weights with the softmax defined above
attention_weights = softmax(scaled_dot_product)

In [None]:
# Combine the value vectors, weighted by the attention weights
output = attention_weights.dot(V)

In [None]:
import numpy as np

def softmax(z):
    """Numerically stable softmax taken over the last axis of ``z``."""
    # Shift by the per-slice maximum so the largest exponent is exp(0) = 1
    shifted = z - np.max(z, axis=-1, keepdims=True)
    numerators = np.exp(shifted)
    normalizers = np.sum(numerators, axis=-1, keepdims=True)
    return numerators / normalizers

def scaled_dot_product_attention(X, W_Q, W_K, W_V):
    """Run single-head scaled dot-product self-attention over ``X``.

    Args:
        X: Input embeddings (10 x 16 in the example below).
        W_Q: Weight matrix that projects ``X`` to queries.
        W_K: Weight matrix that projects ``X`` to keys.
        W_V: Weight matrix that projects ``X`` to values.

    Returns:
        Tuple ``(output, attention_weights)``: the attention-weighted
        combination of the values, and the softmax-normalized scores
        used to form it.
    """
    # Project the input once per role: queries, keys, values
    Q, K, V = (np.dot(X, weights) for weights in (W_Q, W_K, W_V))

    # Query-key similarity, scaled by sqrt of the key dimension (K's last axis)
    scaled_scores = np.dot(Q, K.T) / np.sqrt(K.shape[-1])

    # Softmax turns the scaled scores into attention weights
    attention_weights = softmax(scaled_scores)

    # Weighted combination of the values, returned with the weights themselves
    return np.dot(attention_weights, V), attention_weights

# Example usage
# A seeded generator makes the printed numbers reproducible on a fresh kernel.
rng = np.random.default_rng(42)

# Define input embeddings and weight matrices (entries uniform in [0, 1))
X = rng.random((10, 16))  # 10 elements, each is a 16-dimensional vector
W_Q = rng.random((16, 16))
W_K = rng.random((16, 16))
W_V = rng.random((16, 16))

output, attention_weights = scaled_dot_product_attention(X, W_Q, W_K, W_V)
print("Output (Aggregated Embeddings):")
print(output)
print("\nAttention Weights (Relevance Scores):")
print(attention_weights)

Output (Aggregated Embeddings):
[[3.3677096  3.6852172  4.79113694 4.82288165 5.13289339 3.94665865
  4.46339435 4.70454487 4.40363619 4.65819836 4.85126591 4.69564496
  4.55442404 3.56933867 3.72986483 5.64606118]
 [3.38371585 3.68209679 4.81492899 4.81293965 5.15684909 3.93228679
  4.41763822 4.68938107 4.34992614 4.66398164 4.88760559 4.68324999
  4.52799852 3.57133069 3.72651261 5.64460401]
 [3.37409348 3.68227263 4.80078617 4.81284881 5.14556413 3.93939691
  4.43384787 4.70132666 4.36869097 4.65781379 4.87358213 4.68261455
  4.53729891 3.56846171 3.72339583 5.63957328]
 [3.36475144 3.67277003 4.78299272 4.76740498 5.15425489 3.93213503
  4.35714162 4.72709097 4.29663221 4.63202444 4.90614167 4.61601542
  4.49886452 3.55549629 3.68310096 5.59344439]
 [3.37946226 3.68375036 4.80849185 4.81839287 5.14898156 3.9367978
  4.43500181 4.69205843 4.37053563 4.66368594 4.87422742 4.69026357
  4.53807824 3.57159065 3.72980124 5.64774487]
 [3.36606016 3.68505265 4.78843777 4.82221665 5.131426