In [4]:
import numpy as np

# Single-head scaled dot-product self-attention on a toy sentence, in NumPy.
# Pipeline: embeddings + sinusoidal positional encoding -> Q/K/V projections
# -> softmax(Q K^T / sqrt(d_k)) -> attention-weighted sum of V.


def stable_softmax(x, axis=-1):
    """Return the softmax of `x` along `axis`.

    The maximum along `axis` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps `np.exp` from overflowing to
    `inf` (and the ratio from turning into `nan`) when the inputs are large.

    Args:
        x: NumPy array of raw scores.
        axis: Axis along which the values are normalized to sum to 1.

    Returns:
        Array with the same shape as `x` whose entries along `axis` sum to 1.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused for the sum
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)


# Sample tokenized input (7 words)
tokens = ["What", "are", "the", "symptoms", "of", "diabetes", "?"]

# Embedding matrix (each row is a token's embedding vector of size 4)
E = np.array([
    [1.0, 0.2, 0.5, 0.7],  # "What"
    [0.6, 0.9, 0.4, 0.7],  # "are"
    [0.4, 0.3, 1.1, 0.6],  # "the"
    [1.1, 1.0, 0.7, 0.8],  # "symptoms"
    [0.7, 0.8, 0.6, 0.5],  # "of"
    [1.5, 1.2, 1.4, 0.9],  # "diabetes"
    [0.8, 1.0, 1.3, 1.6]   # "?"
])

# Derive the sizes from the data instead of repeating the literals 7 and 4,
# so editing `tokens` / `E` cannot silently desynchronize the code below.
seq_len, d_model = E.shape
assert seq_len == len(tokens), "E needs exactly one embedding row per token"
assert d_model % 2 == 0, "sin/cos positional encoding needs an even embedding size"

# Positional encoding: even columns receive sin, odd columns receive cos,
# each pair of columns sharing one divisor 10000^(2i / d_model).
pos = np.arange(seq_len)[:, np.newaxis]  # Positions as a column: [0, 1, ..., seq_len - 1]
div_term = np.power(10000, 2 * np.arange(d_model // 2) / d_model)
pe = np.zeros((seq_len, d_model))
pe[:, 0::2] = np.sin(pos / div_term)
pe[:, 1::2] = np.cos(pos / div_term)

# Input representation after adding positional encoding
X = E + pe

# Query, Key, and Value weight matrices
W_q = np.array([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6]
])

W_k = np.array([
    [0.2, 0.3, 0.4, 0.5],
    [0.6, 0.7, 0.8, 0.9],
    [1.0, 1.1, 1.2, 1.3],
    [1.4, 1.5, 1.6, 1.7]
])

W_v = np.array([
    [0.3, 0.4, 0.5, 0.6],
    [0.7, 0.8, 0.9, 1.0],
    [1.1, 1.2, 1.3, 1.4],
    [1.5, 1.6, 1.7, 1.8]
])

# Compute Queries (Q), Keys (K), and Values (V)
Q = X @ W_q  # Shape: (seq_len, 4)
K = X @ W_k  # Shape: (seq_len, 4)
V = X @ W_v  # Shape: (seq_len, 4)

# Compute Attention Scores (Softmax(QK^T / sqrt(d_k)))
d_k = K.shape[-1]  # Scaling factor (dimension of key vectors)
scores = Q @ K.T / np.sqrt(d_k)  # Raw attention scores (before softmax)
attention_scores = stable_softmax(scores, axis=-1)  # Row-wise softmax normalization

# Every token's attention distribution must sum to 1.
assert np.allclose(attention_scores.sum(axis=-1), 1.0)

# Compute final output by multiplying attention scores with V
output = attention_scores @ V


# Print tokens
print("Tokens:", tokens)

# Print attention scores with corresponding tokens
print("\nAttention Scores (How much each token attends to others):")
for i, token in enumerate(tokens):
    print(f"{token}: {attention_scores[i]}")

# Print final output (contextualized embeddings)
print("\nFinal Output (Transformed embeddings after self-attention):")
for i, token in enumerate(tokens):
    print(f"{token}: {output[i]}")


Tokens: ['What', 'are', 'the', 'symptoms', 'of', 'diabetes', '?']

Attention Scores (How much each token attends to others):
What: [1.52530715e-10 1.04420713e-09 1.09310055e-11 6.22907524e-12
 3.19294245e-15 2.00522236e-05 9.99979947e-01]
are: [5.68112283e-11 4.24813834e-10 3.61933423e-12 2.01077412e-12
 7.33775331e-16 1.24883961e-05 9.99987511e-01]
the: [7.12635500e-10 4.27032529e-09 6.12001752e-11 3.62527436e-11
 3.12817775e-14 4.19508673e-05 9.99958044e-01]
symptoms: [9.73978239e-10 5.67894679e-09 8.67864132e-11 5.18178843e-11
 4.97265333e-14 4.87238192e-05 9.99951269e-01]
of: [7.48874940e-08 2.99489478e-07 1.11215922e-08 7.40884345e-09
 3.09542754e-11 3.89809591e-04 9.99609797e-01]
diabetes: [7.67439809e-14 9.97370805e-13 2.27571079e-15 1.07465795e-15
 4.36869080e-20 5.29943539e-07 9.99999470e-01]
?: [7.97000379e-17 1.85326524e-15 1.06258755e-18 4.23184540e-19
 1.73059012e-24 1.98005976e-08 9.99999980e-01]

Final Output (Transformed embeddings after self-attention):
What: [6.921530

In [5]:
import torch
import torch.nn.functional as F

# Single-head scaled dot-product self-attention on a toy sentence, in PyTorch.
# Pipeline: embeddings + sinusoidal positional encoding -> Q/K/V projections
# -> softmax(Q K^T / sqrt(d_k)) -> attention-weighted sum of V.

# Sample tokenized input (7 words)
tokens = ["What", "are", "the", "symptoms", "of", "diabetes", "?"]

# Embedding matrix (each row is a token's embedding vector of size 4)
E = torch.tensor([
    [1.0, 0.2, 0.5, 0.7],  # "What"
    [0.6, 0.9, 0.4, 0.7],  # "are"
    [0.4, 0.3, 1.1, 0.6],  # "the"
    [1.1, 1.0, 0.7, 0.8],  # "symptoms"
    [0.7, 0.8, 0.6, 0.5],  # "of"
    [1.5, 1.2, 1.4, 0.9],  # "diabetes"
    [0.8, 1.0, 1.3, 1.6]   # "?"
], dtype=torch.float32)

# Derive the sizes from the data instead of repeating the literals 7 and 4,
# so editing `tokens` / `E` cannot silently desynchronize the code below.
seq_len, d_model = E.shape
assert seq_len == len(tokens), "E needs exactly one embedding row per token"
assert d_model % 2 == 0, "sin/cos positional encoding needs an even embedding size"

# Positional encoding: even columns receive sin, odd columns receive cos,
# each pair of columns sharing one divisor 10000^(2i / d_model).
pos = torch.arange(seq_len).unsqueeze(1)  # Positions as a column: [0, 1, ..., seq_len - 1]
div_term = torch.pow(10000, 2 * torch.arange(d_model // 2, dtype=torch.float32) / d_model)
pe = torch.zeros((seq_len, d_model))
pe[:, 0::2] = torch.sin(pos / div_term)
pe[:, 1::2] = torch.cos(pos / div_term)

# Input representation after adding positional encoding
X = E + pe

# Query, Key, and Value weight matrices
W_q = torch.tensor([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6]
], dtype=torch.float32)

W_k = torch.tensor([
    [0.2, 0.3, 0.4, 0.5],
    [0.6, 0.7, 0.8, 0.9],
    [1.0, 1.1, 1.2, 1.3],
    [1.4, 1.5, 1.6, 1.7]
], dtype=torch.float32)

W_v = torch.tensor([
    [0.3, 0.4, 0.5, 0.6],
    [0.7, 0.8, 0.9, 1.0],
    [1.1, 1.2, 1.3, 1.4],
    [1.5, 1.6, 1.7, 1.8]
], dtype=torch.float32)

# Compute Queries (Q), Keys (K), and Values (V)
Q = X @ W_q  # Shape: (seq_len, 4)
K = X @ W_k  # Shape: (seq_len, 4)
V = X @ W_v  # Shape: (seq_len, 4)

# Compute Attention Scores (Softmax(QK^T / sqrt(d_k)))
d_k = K.shape[-1]  # Scaling factor (dimension of key vectors)
# A plain Python scalar is enough for the scale; no temporary tensor needed.
scores = (Q @ K.T) / (d_k ** 0.5)  # Raw attention scores
attention_scores = F.softmax(scores, dim=-1)  # Apply softmax normalization along each row

# Compute final output by multiplying attention scores with V
output = attention_scores @ V

# Print tokens
print("Tokens:", tokens)

# Print attention scores with corresponding tokens
print("\nAttention Scores (How much each token attends to others):")
for i, token in enumerate(tokens):
    print(f"{token}: {attention_scores[i].tolist()}")

# Print final output (contextualized embeddings)
print("\nFinal Output (Transformed embeddings after self-attention):")
for i, token in enumerate(tokens):
    print(f"{token}: {output[i].tolist()}")

Tokens: ['What', 'are', 'the', 'symptoms', 'of', 'diabetes', '?']

Attention Scores (How much each token attends to others):
What: [1.5253090457356677e-10, 1.0442061659787782e-09, 1.0930993089852681e-11, 6.2290758488792175e-12, 3.1929368579707105e-15, 2.005222995649092e-05, 0.9999799728393555]
are: [5.681146525438052e-11, 4.248123797179204e-10, 3.6193326981293072e-12, 2.010776744762466e-12, 7.33775430723099e-16, 1.2488432730606291e-05, 0.9999874830245972]
the: [7.126363388287871e-10, 4.270324538424575e-09, 6.120032952638965e-11, 3.6252733454089636e-11, 3.128176022252141e-14, 4.195092697045766e-05, 0.9999580383300781]
symptoms: [9.739811179798608e-10, 5.678931103147988e-09, 8.678639751291684e-11, 5.1817782703578175e-11, 4.972644475007004e-14, 4.872378121945076e-05, 0.9999512434005737]
of: [7.488761610829897e-08, 2.9948901669740735e-07, 1.112158543037367e-08, 7.408841096179231e-09, 3.0954263458005116e-11, 0.0003898085851687938, 0.9996097683906555]
diabetes: [7.67443969701756e-14, 9.97368